# DeepGenePrior

## Dependencies

- TensorFlow >= 0.8.0 (required by prettytensor; it might work with older versions of prettytensor, but this is untested)
- prettytensor
- numpy
- matplotlib and seaborn (optional, for VAE images)

## Usage

To train the latent-feature model (M1), run `ssl_vae.py`. Parameters are set in the same file.
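For example:

```
python ssl_vae.py
```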
###
'''
Similar to M1 from https://github.com/dpkingma/nips14-ssl
Original Author: S. Saemundsson
Edited by: Z. Rahaie
'''
###

import tensorflow as tf
import prettytensor as pt
import numpy as np

import utils
class FullyConnected( object ):
    ''' Simple fully-connected feed-forward network built with prettytensor. '''

    def __init__( self,
                  dim_output,
                  hidden_layers,
                  nonlinearity = tf.nn.softplus,
                  l2loss = 0.0,
                  name = 'FullyConnected' ):

        self.dim_output = dim_output
        self.hidden_layers = hidden_layers
        self.nonlinearity = nonlinearity
        self.l2loss = l2loss
        self.name = name

    def output( self, inputs, phase = pt.Phase.train ):
        ''' Pass inputs through the hidden layers; the final layer is linear. '''
        inputs = pt.wrap( inputs )
        with pt.defaults_scope( phase = phase, activation_fn = self.nonlinearity, l2loss = self.l2loss ):
            for layer in self.hidden_layers:
                inputs = inputs.fully_connected( layer )
            return inputs.fully_connected( self.dim_output, activation_fn = None )
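
# A minimal usage sketch (the placeholder and layer sizes below are
# illustrative assumptions, not part of the original file):
#
#   x = tf.placeholder( tf.float32, [ None, 784 ] )
#   mlp = FullyConnected( dim_output = 10, hidden_layers = [ 256, 64 ] )
#   logits = mlp.output( x )  # linear output layer, shape [ None, 10 ]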
###
'''
Similar to M1 from https://github.com/dpkingma/nips14-ssl
Original Author: S. Saemundsson
Edited by: Z. Rahaie
'''
###

from vae import VariationalAutoencoder
import numpy as np

import data.asd as asd
if __name__ == '__main__':

    # Example hyperparameters (assumed values; tune for the dataset at hand)
    num_batches = 100       # minibatches per epoch
    dim_z = 50              # dimensionality of the latent variable z
    epochs = 1000           # passes through the full dataset
    learning_rate = 1e-4
    l2_loss = 1e-5
    seed = 12345

    # Hidden layer sizes are derived from the input dimension once the data is
    # loaded below: each layer shrinks to the square root of the previous one.
    asd_path = ['nds/AutDB_ASD_cnv_dataset.txt']

    # Uses the anglepy module from Kingma's repository (linked above) to divide the dataset
    train_x, train_y, valid_x, valid_y, test_x, test_y = asd.load_numpy(asd_path, binarize_y=True)

    x_train, y_train = train_x.T, train_y.T
    x_valid, y_valid = valid_x.T, valid_y.T
    x_test, y_test = test_x.T, test_y.T

    dim_x = x_train.shape[1]
    dim_y = y_train.shape[1]

    # Successive square-root shrinkage of the hidden layer sizes
    h1 = int( np.sqrt( dim_x ) )
    h2 = int( np.sqrt( h1 ) )
    h3 = int( np.sqrt( h2 ) )
    hidden_layers_px = [ h1, h2, h3 ]
    hidden_layers_qz = [ h1, h2, h3 ]
    VAE = VariationalAutoencoder( dim_x = dim_x,
                                  dim_z = dim_z,
                                  hidden_layers_px = hidden_layers_px,
                                  hidden_layers_qz = hidden_layers_qz,
                                  l2_loss = l2_loss )

    # draw_img: draw sample images every n iterations (set to 0 to disable)
    VAE.train( x = x_train, x_valid = x_valid, epochs = epochs, num_batches = num_batches,
               learning_rate = learning_rate, seed = seed, stop_iter = 30, print_every = 10, draw_img = 0 )
    weights_as_numpy = VAE.get_weights()
    output_weights_to_label = weights_as_numpy[6]

    # Chain the encoder weight matrices down to the label weights with matrix
    # products, giving one aggregate score per input gene (a sketch; assumes
    # weights_as_numpy holds the layer matrices in order).
    scores = weights_as_numpy[0].dot( weights_as_numpy[1] ).dot( weights_as_numpy[2] ).dot( output_weights_to_label )

    # Indices of genes with a positive score
    genes_index = [ i for i, v in enumerate( scores.ravel() ) if v > 0 ]
    print(genes_index)
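
    # Illustrative shape check (sizes assumed): with encoder weights of shapes
    # (dim_x, h1), (h1, h2), (h2, h3) and label weights of shape (h3, 1), the
    # chained product above is (dim_x, 1): one score per input gene.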
###
'''
Similar to M1 from https://github.com/dpkingma/nips14-ssl
Original Author: S. Saemundsson
Edited by: Z. Rahaie
'''
###

from genclass import GenerativeClassifier
from vae import VariationalAutoencoder

import numpy as np

import data.asd as asd
def encode_dataset( model_path, min_std = 0.0 ):
    ''' Encode all data splits with a trained VAE, dropping latent dimensions
        whose std is below min_std. Relies on dim_x, dim_z and the x_* splits
        defined in the __main__ block below. '''

    VAE = VariationalAutoencoder( dim_x, dim_z )
    with VAE.session:
        VAE.saver.restore( VAE.session, model_path )

        enc_x_lab_mean, enc_x_lab_var = VAE.encode( x_lab )
        enc_x_ulab_mean, enc_x_ulab_var = VAE.encode( x_ulab )
        enc_x_valid_mean, enc_x_valid_var = VAE.encode( x_valid )
        enc_x_test_mean, enc_x_test_var = VAE.encode( x_test )

    # Keep only latent dimensions with sufficient spread over the unlabelled data
    id_x_keep = np.std( enc_x_ulab_mean, axis = 0 ) > min_std

    enc_x_lab_mean, enc_x_lab_var = enc_x_lab_mean[ :, id_x_keep ], enc_x_lab_var[ :, id_x_keep ]
    enc_x_ulab_mean, enc_x_ulab_var = enc_x_ulab_mean[ :, id_x_keep ], enc_x_ulab_var[ :, id_x_keep ]
    enc_x_valid_mean, enc_x_valid_var = enc_x_valid_mean[ :, id_x_keep ], enc_x_valid_var[ :, id_x_keep ]
    enc_x_test_mean, enc_x_test_var = enc_x_test_mean[ :, id_x_keep ], enc_x_test_var[ :, id_x_keep ]

    data_lab = np.hstack( [ enc_x_lab_mean, enc_x_lab_var ] )
    data_ulab = np.hstack( [ enc_x_ulab_mean, enc_x_ulab_var ] )
    data_valid = np.hstack( [ enc_x_valid_mean, enc_x_valid_var ] )
    data_test = np.hstack( [ enc_x_test_mean, enc_x_test_var ] )

    return data_lab, data_ulab, data_valid, data_test
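
# Note: the arrays returned above have shape (n_examples, 2 * n_kept_dims) -
# latent means concatenated with variances - which is why dim_x is halved
# again after encoding in the __main__ block below.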
if __name__ == '__main__':

    num_lab = 100           #Number of labelled examples (total); assumed value
    num_batches = 100       #Number of minibatches in a single epoch
    dim_z = 50              #Dimensionality of latent variable (z); assumed value, must match the trained VAE
    epochs = 1001           #Number of epochs through the full dataset
    learning_rate = 1e-4    #Learning rate of ADAM
    alpha = 0.1             #Discriminatory factor (see equation (9) of http://arxiv.org/pdf/1406.5298v2.pdf)
    seed = 12345            #Seed for RNG

    #Neural networks parameterising p(x|z,y), q(z|x,y) and q(y|x); sizes assumed
    hidden_layers_px = [ 500 ]
    hidden_layers_qz = [ 500 ]
    hidden_layers_qy = [ 500 ]

    ####################
    ''' Load Dataset '''
    ####################
    asd_path = ['nds/asd_case.txt', 'nds/asd_control.txt']

    # Uses the anglepy module from the original paper (linked at top) to split
    # the dataset for semi-supervised training
    train_x, train_y, valid_x, valid_y, test_x, test_y = asd.load_numpy_split(asd_path, binarize_y=True)
    x_l, y_l, x_u, y_u = asd.create_semisupervised(train_x, train_y, num_lab)

    x_lab, y_lab = x_l.T, y_l.T
    x_ulab, y_ulab = x_u.T, y_u.T
    x_valid, y_valid = valid_x.T, valid_y.T
    x_test, y_test = test_x.T, test_y.T

    dim_x = x_lab.shape[1]  # input dimensionality, used by encode_dataset below
    ################
    ''' Load VAE '''
    ################

    VAE_model_path = 'temp/mid_training_0.cpkt'
    min_std = 0.1 #Dimensions with std < min_std are removed before training with the GC

    data_lab, data_ulab, data_valid, data_test = encode_dataset( VAE_model_path, min_std )

    dim_x = data_lab.shape[1] // 2  # integer division: each row is [means, variances]
    dim_y = y_lab.shape[1]
    num_examples = data_lab.shape[0] + data_ulab.shape[0]
    ###################################
    ''' Train Generative Classifier '''
    ###################################

    GC = GenerativeClassifier( dim_x, dim_z, dim_y,
                               num_examples, num_lab, num_batches,
                               hidden_layers_px = hidden_layers_px,
                               hidden_layers_qz = hidden_layers_qz,
                               hidden_layers_qy = hidden_layers_qy,
                               alpha = alpha )

    GC.train( x_labelled = data_lab, y = y_lab, x_unlabelled = data_ulab,
              x_valid = data_valid, y_valid = y_valid,
              epochs = epochs,
              learning_rate = learning_rate,
              seed = seed,
              print_every = 10,
              load_path = None )

    ############################
    ''' Evaluate on Test Set '''
    ############################

    GC_eval = GenerativeClassifier( dim_x, dim_z, dim_y, num_examples, num_lab, num_batches )

    with GC_eval.session:
        GC_eval.saver.restore( GC_eval.session, GC.save_path )
        GC_eval.predict_labels( data_test, y_test )
###
'''
Similar to M1 from https://github.com/dpkingma/nips14-ssl
Original Author: S. Saemundsson
Edited by: Z. Rahaie
'''
###

import prettytensor as pt
import tensorflow as tf
import numpy as np

logc = np.log( 2. * np.pi )
c = - 0.5 * np.log( 2 * np.pi )
def tf_normal_logpdf(x, mu, log_sigma_sq):
    ''' log N(x; mu, sigma^2), with sigma^2 given in log space '''
    return ( - 0.5 * logc - log_sigma_sq / 2. - tf.div( tf.square( tf.sub( x, mu ) ), 2 * tf.exp( log_sigma_sq ) ) )

def tf_stdnormal_logpdf(x):
    ''' log N(x; 0, 1) '''
    return ( - 0.5 * ( logc + tf.square( x ) ) )

def tf_gaussian_ent(log_sigma_sq):
    ''' E_q[ log q(z) ] (negative entropy) of q = N(mu, sigma^2), per dimension '''
    return ( - 0.5 * ( logc + 1.0 + log_sigma_sq ) )

def tf_gaussian_marg(mu, log_sigma_sq):
    ''' E_q[ log N(z; 0, 1) ] for q = N(mu, sigma^2), per dimension '''
    return ( - 0.5 * ( logc + ( tf.square( mu ) + tf.exp( log_sigma_sq ) ) ) )
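
# For q = N(mu, sigma^2), these helpers combine into the usual KL term of the
# variational lower bound (per latent dimension):
#   KL( q || N(0,1) ) = tf_gaussian_ent(log_sigma_sq) - tf_gaussian_marg(mu, log_sigma_sq)
#                     = 0.5 * ( mu^2 + sigma^2 - 1 - log(sigma^2) )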
def tf_binary_xentropy(x, y, const = 1e-10):
    ''' Binary cross-entropy, with y clipped away from 0 and 1 for numerical stability '''
    return - ( x * tf.log( tf.clip_by_value( y, const, 1.0 ) ) + \
               (1.0 - x) * tf.log( tf.clip_by_value( 1.0 - y, const, 1.0 ) ) )
def feed_numpy_semisupervised(num_lab_batch, num_ulab_batch, x_lab, y, x_ulab):
    ''' Yield minibatches of [labelled means, labelled variances, labels,
        unlabelled means, unlabelled variances]; each row of x_* holds the
        encoded means in its first half and the variances in its second. '''
    size = x_lab.shape[0] + x_ulab.shape[0]
    batch_size = num_lab_batch + num_ulab_batch
    count = int(size / batch_size)

    dim = x_lab.shape[1]

    for i in xrange(count):
        start_lab = i * num_lab_batch
        end_lab = start_lab + num_lab_batch
        start_ulab = i * num_ulab_batch
        end_ulab = start_ulab + num_ulab_batch

        yield [ x_lab[start_lab:end_lab, :dim//2], x_lab[start_lab:end_lab, dim//2:dim], y[start_lab:end_lab],
                x_ulab[start_ulab:end_ulab, :dim//2], x_ulab[start_ulab:end_ulab, dim//2:dim] ]
def feed_numpy(batch_size, x):
    ''' Yield successive minibatches of x; remainder rows are dropped '''
    size = x.shape[0]
    count = int(size / batch_size)

    for i in xrange(count):
        start = i * batch_size
        end = start + batch_size

        yield x[start:end]
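
# A quick illustrative check of the batching helper (toy shapes, assumed data):
#
#   x = np.random.randn( 10, 4 )
#   for batch in feed_numpy( 5, x ):
#       print( batch.shape )   # -> (5, 4), twice; any remainder rows are dropped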