# DeepGenePrior

## Dependencies

- TensorFlow >= 0.8.0 (required by prettytensor; older versions of prettytensor might work, but are untested)
- prettytensor
- numpy
- optionally matplotlib and seaborn, for VAE images

## Usage

To train the latent-feature model (M1), run `ssl_vae.py`. Parameters are set in the same file.
```python
###
'''
Similar to M1 from https://github.com/dpkingma/nips14-ssl
Original Author: S. Saemundsson
Edited by: Z. Rahaie
'''
###
import tensorflow as tf
import prettytensor as pt
import numpy as np

import utils

class FullyConnected( object ):

    def __init__( self,
                  dim_output,
                  hidden_layers,
                  nonlinearity = tf.nn.softplus,
                  l2loss = 0.0,
                  name = 'FullyConnected' ):

        self.dim_output = dim_output
        self.hidden_layers = hidden_layers
        self.nonlinearity = nonlinearity
        self.l2loss = l2loss

    def output( self, inputs, phase = pt.Phase.train ):
        # Stack fully connected hidden layers with the chosen nonlinearity,
        # then finish with a linear layer of width dim_output.
        inputs = pt.wrap( inputs )
        with pt.defaults_scope( phase = phase, activation_fn = self.nonlinearity, l2loss = self.l2loss ):
            for layer in self.hidden_layers:
                inputs = inputs.fully_connected( layer )
            return inputs.fully_connected( self.dim_output, activation_fn = None )
```
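For orientation, here is a minimal, hypothetical sketch of how `FullyConnected` might be instantiated. The input shape, layer widths, and output dimension below are illustrative assumptions, not values taken from the repository; the class itself is the one defined above:

```python
import tensorflow as tf

# Hypothetical example: 784-dimensional inputs, two hidden layers,
# and a 10-dimensional linear output head.
x = tf.placeholder( tf.float32, [ None, 784 ] )
net = FullyConnected( dim_output = 10, hidden_layers = [ 256, 64 ] )
logits = net.output( x )   # no final activation; apply one downstream as needed
```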
```python
###
'''
Similar to M1 from https://github.com/dpkingma/nips14-ssl
Original Author: S. Saemundsson
Edited by: Z. Rahaie
'''
###
from vae import VariationalAutoencoder

import numpy as np
import data.asd as asd

if __name__ == '__main__':

    # Training hyperparameters (example values; tune for your dataset)
    num_batches = 100       # minibatches per epoch
    dim_z = 50              # dimension of the latent variable z
    epochs = 1000           # passes through the full dataset
    learning_rate = 1e-4
    l2_loss = 1e-5
    seed = 12345

    asd_path = ['nds/AutDB_ASD_cnv_dataset.txt']
    # Uses the anglepy module from Kingma's GitHub repo (linked above) to divide the dataset
    train_x, train_y, valid_x, valid_y, test_x, test_y = asd.load_numpy(asd_path, binarize_y=True)

    x_train, y_train = train_x.T, train_y.T
    x_valid, y_valid = valid_x.T, valid_y.T
    x_test, y_test = test_x.T, test_y.T

    dim_x = x_train.shape[1]
    dim_y = y_train.shape[1]

    # Hidden layer widths shrink by successive square roots of the input
    # dimension; the recognition network q(z|x) mirrors the generator p(x|z).
    hidden_layers_px = [ int(np.sqrt(dim_x)) ]
    hidden_layers_px.append( int(np.sqrt(hidden_layers_px[0])) )
    hidden_layers_px.append( int(np.sqrt(hidden_layers_px[1])) )
    hidden_layers_qz = list(hidden_layers_px)

    VAE = VariationalAutoencoder(   dim_x = dim_x,
                                    dim_z = dim_z,
                                    hidden_layers_px = hidden_layers_px,
                                    hidden_layers_qz = hidden_layers_qz,
                                    l2_loss = l2_loss )

    # draw_img draws sample images every n iterations (set to 0 to disable)
    VAE.train(  x = x_train, x_valid = x_valid, epochs = epochs, num_batches = num_batches,
                learning_rate = learning_rate, seed = seed, stop_iter = 30, print_every = 10, draw_img = 0 )

    # Rank genes by chaining the learned weight matrices from the input layer
    # through to the label projection.
    weights_as_numpy = VAE.get_weights()
    output_weights_to_label = weights_as_numpy[6]
    scores = weights_as_numpy[0].dot( weights_as_numpy[1] ).dot( weights_as_numpy[2] ).dot( output_weights_to_label )
    # One influence score per input gene (summed over label columns if dim_y > 1)
    gene_scores = scores.sum(axis=1) if scores.ndim > 1 else scores
    genes_index = [i for i, v in enumerate(gene_scores) if v > 0]
    print(genes_index)
```
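The gene-ranking step chains the learned weight matrices into a single input-to-label influence matrix. Here is a self-contained NumPy sketch of the same computation, with made-up shapes standing in for the output of `VAE.get_weights()`:

```python
import numpy as np

rng = np.random.RandomState( 0 )
# Dummy weights for an input -> 32 -> 8 -> 4 encoder plus a 4 -> 1 label projection
W0, W1, W2 = rng.randn( 100, 32 ), rng.randn( 32, 8 ), rng.randn( 8, 4 )
W_label = rng.randn( 4, 1 )

influence = W0.dot( W1 ).dot( W2 ).dot( W_label )           # shape (100, 1)
genes_index = [ i for i, v in enumerate( influence.ravel() ) if v > 0 ]
print( len( genes_index ), 'inputs with a positive influence score' )
```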
```python
###
'''
Similar to M1 from https://github.com/dpkingma/nips14-ssl
Original Author: S. Saemundsson
Edited by: Z. Rahaie
'''
###
from genclass import GenerativeClassifier
from vae import VariationalAutoencoder

import numpy as np
import data.asd as asd

def encode_dataset( model_path, min_std = 0.0 ):
    # Restore the trained VAE and encode every split of the dataset into
    # latent means and variances.
    VAE = VariationalAutoencoder( dim_x, dim_z )
    with VAE.session:
        VAE.saver.restore( VAE.session, model_path )
        enc_x_lab_mean, enc_x_lab_var = VAE.encode( x_lab )
        enc_x_ulab_mean, enc_x_ulab_var = VAE.encode( x_ulab )
        enc_x_valid_mean, enc_x_valid_var = VAE.encode( x_valid )
        enc_x_test_mean, enc_x_test_var = VAE.encode( x_test )

    # Keep only latent dimensions whose encoded means vary across the
    # unlabelled set by more than min_std.
    id_x_keep = np.std( enc_x_ulab_mean, axis = 0 ) > min_std
    enc_x_lab_mean, enc_x_lab_var = enc_x_lab_mean[ :, id_x_keep ], enc_x_lab_var[ :, id_x_keep ]
    enc_x_ulab_mean, enc_x_ulab_var = enc_x_ulab_mean[ :, id_x_keep ], enc_x_ulab_var[ :, id_x_keep ]
    enc_x_valid_mean, enc_x_valid_var = enc_x_valid_mean[ :, id_x_keep ], enc_x_valid_var[ :, id_x_keep ]
    enc_x_test_mean, enc_x_test_var = enc_x_test_mean[ :, id_x_keep ], enc_x_test_var[ :, id_x_keep ]

    data_lab = np.hstack( [ enc_x_lab_mean, enc_x_lab_var ] )
    data_ulab = np.hstack( [ enc_x_ulab_mean, enc_x_ulab_var ] )
    data_valid = np.hstack( [ enc_x_valid_mean, enc_x_valid_var ] )
    data_test = np.hstack( [ enc_x_test_mean, enc_x_test_var ] )

    return data_lab, data_ulab, data_valid, data_test

if __name__ == '__main__':

    num_lab = 100           #Number of labelled examples (example value; set per experiment)
    num_batches = 100       #Number of minibatches in a single epoch
    dim_z = 50              #Dimension of latent z (must match the trained VAE)
    epochs = 1001           #Number of epochs through the full dataset
    learning_rate = 1e-4    #Learning rate of ADAM
    alpha = 0.1             #Discriminatory factor (see equation (9) of http://arxiv.org/pdf/1406.5298v2.pdf)
    seed = 12345            #Seed for RNG

    # Example hidden layer widths for p(x|z), q(z|x) and q(y|x)
    hidden_layers_px = [ 500 ]
    hidden_layers_qz = [ 500 ]
    hidden_layers_qy = [ 500 ]

    ####################
    ''' Load Dataset '''
    ####################

    asd_path = ['nds/asd_case.txt', 'nds/asd_control.txt']
    #Uses the anglepy module from the original paper (linked at top) to split the dataset for semi-supervised training
    train_x, train_y, valid_x, valid_y, test_x, test_y = asd.load_numpy_split(asd_path, binarize_y=True)
    x_l, y_l, x_u, y_u = asd.create_semisupervised(train_x, train_y, num_lab)

    x_lab, y_lab = x_l.T, y_l.T
    x_ulab, y_ulab = x_u.T, y_u.T
    x_valid, y_valid = valid_x.T, valid_y.T
    x_test, y_test = test_x.T, test_y.T

    dim_x = x_lab.shape[1]  #Dimension of the raw input, used by encode_dataset

    ################
    ''' Load VAE '''
    ################

    VAE_model_path = 'temp/mid_training_0.cpkt'
    min_std = 0.1 #Dimensions with std < min_std are removed before training with GC

    data_lab, data_ulab, data_valid, data_test = encode_dataset( VAE_model_path, min_std )

    dim_x = data_lab.shape[1] // 2  #Encoded data stores [mean, var] halves
    dim_y = y_lab.shape[1]
    num_examples = data_lab.shape[0] + data_ulab.shape[0]

    ###################################
    ''' Train Generative Classifier '''
    ###################################

    GC = GenerativeClassifier(  dim_x, dim_z, dim_y,
                                num_examples, num_lab, num_batches,
                                hidden_layers_px = hidden_layers_px,
                                hidden_layers_qz = hidden_layers_qz,
                                hidden_layers_qy = hidden_layers_qy,
                                alpha = alpha )

    GC.train(   x_labelled = data_lab, y = y_lab, x_unlabelled = data_ulab,
                x_valid = data_valid, y_valid = y_valid,
                epochs = epochs,
                learning_rate = learning_rate,
                seed = seed,
                print_every = 10,
                load_path = None )

    ############################
    ''' Evaluate on Test Set '''
    ############################

    GC_eval = GenerativeClassifier( dim_x, dim_z, dim_y, num_examples, num_lab, num_batches )

    with GC_eval.session:
        GC_eval.saver.restore( GC_eval.session, GC.save_path )
        GC_eval.predict_labels( data_test, y_test )
```
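The `min_std` filter in `encode_dataset` simply drops latent dimensions whose encoded means barely vary across the unlabelled set. A standalone NumPy illustration of that masking, on dummy data rather than the repository's:

```python
import numpy as np

rng = np.random.RandomState( 1 )
enc_mean = rng.randn( 500, 50 )     # dummy encoded means: 500 examples, 50 dims
enc_mean[ :, 10 ] = 0.0             # simulate a collapsed latent dimension
min_std = 0.1

id_x_keep = np.std( enc_mean, axis = 0 ) > min_std
print( 'keeping', id_x_keep.sum(), 'of', enc_mean.shape[1], 'dimensions' )
enc_mean = enc_mean[ :, id_x_keep ]
```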
```python
###
'''
Similar to M1 from https://github.com/dpkingma/nips14-ssl
Original Author: S. Saemundsson
Edited by: Z. Rahaie
'''
###
import prettytensor as pt
import tensorflow as tf
import numpy as np

logc = np.log(2.*np.pi)
c = - 0.5 * np.log(2*np.pi)

# NOTE: tf.sub and tf.div are the TF <= 0.12 names (renamed tf.subtract
# and tf.divide in TF 1.0), matching the TensorFlow version pinned above.

def tf_normal_logpdf(x, mu, log_sigma_sq):
    # log N(x; mu, sigma^2) = -0.5*log(2*pi) - 0.5*log(sigma^2) - (x - mu)^2 / (2*sigma^2)
    return ( - 0.5 * logc - log_sigma_sq / 2. - tf.div( tf.square( tf.sub( x, mu ) ), 2 * tf.exp( log_sigma_sq ) ) )

def tf_stdnormal_logpdf(x):
    # log N(x; 0, 1)
    return ( - 0.5 * ( logc + tf.square( x ) ) )

def tf_gaussian_ent(log_sigma_sq):
    # Entropy of a Gaussian with the given log-variance
    return ( - 0.5 * ( logc + 1.0 + log_sigma_sq ) )

def tf_gaussian_marg(mu, log_sigma_sq):
    # Expected log-density under a standard normal prior
    return ( - 0.5 * ( logc + ( tf.square( mu ) + tf.exp( log_sigma_sq ) ) ) )

def tf_binary_xentropy(x, y, const = 1e-10):
    # Binary cross-entropy with y clipped away from 0 and 1 for stability
    return - ( x * tf.log( tf.clip_by_value( y, const, 1.0 ) ) + \
               (1.0 - x) * tf.log( tf.clip_by_value( 1.0 - y, const, 1.0 ) ) )

def feed_numpy_semisupervised(num_lab_batch, num_ulab_batch, x_lab, y, x_ulab):
    # Yield aligned labelled/unlabelled minibatches; each x stores [mean, var]
    # halves along its feature axis, which are split apart here.
    size = x_lab.shape[0] + x_ulab.shape[0]
    batch_size = num_lab_batch + num_ulab_batch
    count = int(size / batch_size)

    dim = x_lab.shape[1]

    for i in xrange(count):
        start_lab = i * num_lab_batch
        end_lab = start_lab + num_lab_batch
        start_ulab = i * num_ulab_batch
        end_ulab = start_ulab + num_ulab_batch

        yield [ x_lab[start_lab:end_lab, :dim // 2], x_lab[start_lab:end_lab, dim // 2:dim], y[start_lab:end_lab],
                x_ulab[start_ulab:end_ulab, :dim // 2], x_ulab[start_ulab:end_ulab, dim // 2:dim] ]

def feed_numpy(batch_size, x):
    # Yield consecutive full minibatches of x (any remainder is dropped)
    size = x.shape[0]
    count = int(size / batch_size)

    for i in xrange(count):
        start = i * batch_size
        end = start + batch_size

        yield x[start:end]
```
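And a quick sketch of driving the `feed_numpy` generator above; the array is an arbitrary example:

```python
import numpy as np

x = np.arange( 20.0 ).reshape( 10, 2 )    # 10 examples, 2 features
for batch in feed_numpy( 4, x ):
    print( batch.shape )                  # two full (4, 2) batches; remainder dropped
```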