@@ -0,0 +1,128 @@
| #!/usr/bin/env python | |||
| import click as ck | |||
| import numpy as np | |||
| import pandas as pd | |||
| from keras.models import load_model | |||
| from aaindex import INVALID_ACIDS | |||
| MAXLEN = 1002 | |||
| @ck.command() | |||
| @ck.option('--in-file', '-i', help='Input FASTA file', required=True) | |||
| @ck.option('--threshold', '-t', default=0.3, help='Prediction threshold') | |||
| @ck.option('--batch-size', '-bs', default=1, help='Batch size for prediction model') | |||
| @ck.option('--include-long-seq', '-ils', is_flag=True, help='Include long sequences') | |||
| @ck.option('--onto', '-go', default='bp', help='Sub-ontology to be predicted: bp, mf, cc') | |||
| def main(in_file, threshold, batch_size, include_long_seq, onto): | |||
| out_file = onto +'_results.txt' | |||
| chunk_size = 1000 | |||
| global model | |||
| global functions | |||
| ngram_df = pd.read_pickle('data/models/ngrams.pkl') | |||
| global vocab | |||
| vocab = {} | |||
| global gram_len | |||
| for key, gram in enumerate(ngram_df['ngrams']): | |||
| vocab[gram] = key + 1 | |||
| gram_len = len(ngram_df['ngrams'][0]) | |||
print('Gram length:', gram_len)
print('Vocabulary size:', len(vocab))
| model = load_model('Data/data1/models/model_%s.h5' % onto) | |||
| df = pd.read_pickle('Data/data1/models/%s.pkl' % onto) | |||
| functions = df['functions'] | |||
| w = open(out_file, 'w') | |||
| for ids, sequences in read_fasta(in_file, chunk_size, include_long_seq): | |||
| data = get_data(sequences) | |||
| results = predict(data, model, threshold, batch_size) | |||
| for i in range(len(ids)): | |||
| w.write(ids[i]) | |||
| w.write('\n') | |||
| for res in results[i]: | |||
| w.write(res) | |||
| w.write('\n') | |||
| w.close() | |||
| def is_ok(seq): | |||
| for c in seq: | |||
| if c in INVALID_ACIDS: | |||
| return False | |||
| return True | |||
| def read_fasta(filename, chunk_size, include_long_seq): | |||
| seqs = list() | |||
| info = list() | |||
| seq = '' | |||
| inf = '' | |||
| with open(filename) as f: | |||
| for line in f: | |||
| line = line.strip() | |||
| if line.startswith('>'): | |||
| if seq != '': | |||
| if is_ok(seq): | |||
| if include_long_seq: | |||
| seqs.append(seq) | |||
| info.append(inf) | |||
| if len(info) == chunk_size: | |||
| yield (info, seqs) | |||
| seqs = list() | |||
| info = list() | |||
| elif len(seq) <= MAXLEN: | |||
| seqs.append(seq) | |||
| info.append(inf) | |||
| if len(info) == chunk_size: | |||
| yield (info, seqs) | |||
| seqs = list() | |||
| info = list() | |||
| else: | |||
print('Ignoring sequence {} because its length > 1002'
.format(inf))
else:
print('Ignoring sequence {} because of ambiguous AA'
.format(inf))
| seq = '' | |||
| inf = line[1:].split()[0] | |||
| else: | |||
| seq += line | |||
| seqs.append(seq) | |||
| info.append(inf) | |||
| yield (info, seqs) | |||
| def get_data(sequences): | |||
| n = len(sequences) | |||
data = np.zeros((n, 1000), dtype=np.float32)  # model input length: MAXLEN - gram_len + 1 = 1000 for the 3-gram vocabulary
| for i in range(len(sequences)): | |||
| seq = sequences[i] | |||
| for j in range(min(MAXLEN, len(seq)) - gram_len + 1): | |||
| data[i, j] = vocab[seq[j: (j + gram_len)]] | |||
| return data | |||
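# Illustrative sketch (not part of the original script): how get_data() turns a protein
# sequence into overlapping n-gram indices, assuming 3-grams and a toy vocabulary.
def _ngram_encoding_example():
    toy_vocab = {'MKV': 1, 'KVL': 2, 'VLA': 3}   # hypothetical entries; the real vocab comes from ngrams.pkl
    seq = 'MKVLA'
    encoded = [toy_vocab[seq[j: j + 3]] for j in range(len(seq) - 3 + 1)]
    return encoded   # [1, 2, 3] -- these values fill one row of the data matrix in get_data()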
| def predict(data, model, threshold, batch_size): | |||
| n = data.shape[0] | |||
| result = list() | |||
| for i in range(n): | |||
| result.append(list()) | |||
| predictions = model.predict(data, batch_size=batch_size, verbose=1) | |||
| for i in range(n): | |||
| pred = (predictions[i] >= threshold).astype('int32') | |||
| for j in range(len(functions)): | |||
| if pred[j] == 1: | |||
| result[i].append(functions[j] + ' with score ' + '%.2f' % predictions[i][j]) | |||
| return result | |||
| if __name__ == '__main__': | |||
| main() | |||
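# Example invocation (illustrative only; the script and FASTA file names are hypothetical,
# the flags come from the click options above):
#   python predict.py -i proteins.fasta -go mf -t 0.3 -bs 16
# Results are written to '<onto>_results.txt', e.g. 'mf_results.txt'.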
@@ -0,0 +1,191 @@
| from functools import partial | |||
| import numpy as np | |||
| from keras import backend as K | |||
| from keras.layers import Input, RepeatVector, Multiply, Lambda, Permute | |||
| from keras.layers.merge import _Merge | |||
| from keras.models import Model | |||
| from keras.optimizers import Adam, SGD | |||
| from keras.losses import binary_crossentropy | |||
| def wasserstein_loss(y_true, y_pred): | |||
| return K.mean(-y_true * y_pred) | |||
| def generator_recunstruction_loss(y_true, y_pred, enableTrain): | |||
| def rescale_back_labels(labels): | |||
| rescaled_back_labels = (labels / 2) + 0.5 | |||
| return rescaled_back_labels | |||
| return binary_crossentropy(rescale_back_labels(y_true), rescale_back_labels(y_pred)) #* enableTrain | |||
| global enable_train | |||
| enable_train = Input(shape = (1,)) | |||
| global generator_recunstruction_loss_new | |||
| generator_recunstruction_loss_new = partial(generator_recunstruction_loss, enableTrain = enable_train) | |||
| generator_recunstruction_loss_new.__name__ = 'generator_recunstruction_loss_new' | |||
| def gradient_penalty_loss(y_true, y_pred, averaged_samples, gradient_penalty_weight): | |||
| """Calculates the gradient penalty loss for a batch of "averaged" samples. | |||
| In Improved WGANs, the 1-Lipschitz constraint is enforced by adding a term to the loss function | |||
| that penalizes the network if the gradient norm moves away from 1. However, it is impossible to evaluate | |||
| this function at all points in the input space. The compromise used in the paper is to choose random points | |||
| on the lines between real and generated samples, and check the gradients at these points. Note that it is the | |||
| gradient w.r.t. the input averaged samples, not the weights of the discriminator, that we're penalizing! | |||
| In order to evaluate the gradients, we must first run samples through the generator and evaluate the loss. | |||
| Then we get the gradients of the discriminator w.r.t. the input averaged samples. | |||
| The l2 norm and penalty can then be calculated for this gradient. | |||
| Note that this loss function requires the original averaged samples as input, but Keras only supports passing | |||
| y_true and y_pred to loss functions. To get around this, we make a partial() of the function with the | |||
| averaged_samples argument, and use that for model training.""" | |||
| # first get the gradients: | |||
| # assuming: - that y_pred has dimensions (batch_size, 1) | |||
| # - averaged_samples has dimensions (batch_size, nbr_features) | |||
| # gradients afterwards has dimension (batch_size, nbr_features), basically | |||
| # a list of nbr_features-dimensional gradient vectors | |||
| gradients = K.gradients(y_pred, averaged_samples)[0] | |||
| # compute the euclidean norm by squaring ... | |||
| gradients_sqr = K.square(gradients) | |||
| # ... summing over the rows ... | |||
| gradients_sqr_sum = K.sum(gradients_sqr, | |||
| axis=np.arange(1, len(gradients_sqr.shape))) | |||
| # ... and sqrt | |||
| gradient_l2_norm = K.sqrt(gradients_sqr_sum) | |||
| # compute lambda * (1 - ||grad||)^2 still for each single sample | |||
| gradient_penalty = gradient_penalty_weight * K.square(1 - gradient_l2_norm) | |||
| # return the mean as loss over all the batch samples | |||
| return K.mean(gradient_penalty) | |||
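# Illustrative NumPy sketch (not used by the models): the same per-sample penalty
# lambda * (1 - ||grad||_2)^2 that the Keras code above averages over the batch.
def _gradient_penalty_example():
    grads = np.array([[0.6, 0.8],    # ||grad||_2 = 1.0 -> no penalty
                      [3.0, 4.0]])   # ||grad||_2 = 5.0 -> penalised
    gradient_penalty_weight = 10.0
    l2_norm = np.sqrt(np.sum(np.square(grads), axis=1))
    penalty = gradient_penalty_weight * np.square(1.0 - l2_norm)
    return penalty.mean()            # (0 + 160) / 2 = 80.0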
| def WGAN_wrapper(generator, discriminator, generator_input_shape, discriminator_input_shape, discriminator_input_shape2, | |||
| batch_size, gradient_penalty_weight, embeddings): | |||
| BATCH_SIZE = batch_size | |||
| GRADIENT_PENALTY_WEIGHT = gradient_penalty_weight | |||
| def set_trainable_state(model, state): | |||
| for layer in model.layers: | |||
| layer.trainable = state | |||
| model.trainable = state | |||
| class RandomWeightedAverage(_Merge): | |||
| """Takes a randomly-weighted average of two tensors. In geometric terms, this outputs a random point on the line | |||
| between each pair of input points. | |||
| Inheriting from _Merge is a little messy but it was the quickest solution I could think of. | |||
| Improvements appreciated.""" | |||
| def _merge_function(self, inputs): | |||
| weights = K.random_uniform((BATCH_SIZE, 1)) | |||
| print(inputs[0]) | |||
| return (weights * inputs[0]) + ((1 - weights) * inputs[1]) | |||
| # The generator_model is used when we want to train the generator layers. | |||
| # As such, we ensure that the discriminator layers are not trainable. | |||
| # Note that once we compile this model, updating .trainable will have no effect within it. As such, it | |||
| # won't cause problems if we later set discriminator.trainable = True for the discriminator_model, as long | |||
| # as we compile the generator_model first. | |||
| def mul(input): | |||
| embedds = K.variable(np.expand_dims(embeddings.T, 0), dtype='float32') | |||
| batch_size = K.shape(input)[0] | |||
| expanded_embed = K.tile(embedds, (batch_size, 1, 1)) | |||
| expanded_input = RepeatVector(100)(input) | |||
| masked = Multiply()([expanded_embed, expanded_input]) | |||
| return K.permute_dimensions(masked, [0,2,1]) | |||
| set_trainable_state(discriminator, False) | |||
| set_trainable_state(generator, True) | |||
| #enable_train = Input(shape = (1,)) | |||
| generator_input = Input(shape=generator_input_shape) | |||
| generator_layers = generator(generator_input) | |||
| #masked_generator_layers = Lambda(mul)(generator_layers) | |||
| #discriminator_noise_in = Input(shape=(1,)) | |||
| #input_seq_g = Input(shape = discriminator_input_shape2) | |||
| discriminator_layers_for_generator= discriminator([generator_layers, generator_input]) | |||
| generator_model = Model(inputs=[generator_input, enable_train], | |||
| outputs=[generator_layers, discriminator_layers_for_generator]) | |||
# We use the Adam parameters from Gulrajani et al.
| #global generator_recunstruction_loss_new | |||
| #generator_recunstruction_loss_new = partial(generator_recunstruction_loss, enableTrain = enable_train) | |||
| #generator_recunstruction_loss_new.__name__ = 'generator_RLN' | |||
| loss = [generator_recunstruction_loss_new, wasserstein_loss] | |||
| loss_weights = [10000, 1] | |||
| generator_model.compile(optimizer=Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08), | |||
| loss=loss, loss_weights=loss_weights) | |||
| # Now that the generator_model is compiled, we can make the discriminator layers trainable. | |||
| set_trainable_state(discriminator, True) | |||
| set_trainable_state(generator, False) | |||
| # The discriminator_model is more complex. It takes both real image samples and random noise seeds as input. | |||
| # The noise seed is run through the generator model to get generated images. Both real and generated images | |||
| # are then run through the discriminator. Although we could concatenate the real and generated images into a | |||
| # single tensor, we don't (see model compilation for why). | |||
| real_samples = Input(shape=discriminator_input_shape) | |||
| input_seq = Input(shape = discriminator_input_shape2) | |||
| generator_input_for_discriminator = Input(shape=generator_input_shape) | |||
| generated_samples_for_discriminator = generator(generator_input_for_discriminator) | |||
| #masked_real_samples = Lambda(mul)(real_samples) | |||
| #masked_generated_samples_for_discriminator= Lambda(mul)(generated_samples_for_discriminator) | |||
| discriminator_output_from_generator = discriminator([generated_samples_for_discriminator, generator_input_for_discriminator]) | |||
| discriminator_output_from_real_samples= discriminator([real_samples, input_seq]) | |||
| # We also need to generate weighted-averages of real and generated samples, to use for the gradient norm penalty. | |||
| averaged_samples = RandomWeightedAverage()([real_samples, generated_samples_for_discriminator]) | |||
| average_seq = RandomWeightedAverage()([input_seq, generator_input_for_discriminator]) | |||
| # We then run these samples through the discriminator as well. Note that we never really use the discriminator | |||
| # output for these samples - we're only running them to get the gradient norm for the gradient penalty loss. | |||
| #print('hehehe') | |||
| #print(averaged_samples) | |||
| #masked_averaged_samples = Lambda(mul)(averaged_samples) | |||
| averaged_samples_out = discriminator([averaged_samples, average_seq]) | |||
| # The gradient penalty loss function requires the input averaged samples to get gradients. However, | |||
| # Keras loss functions can only have two arguments, y_true and y_pred. We get around this by making a partial() | |||
| # of the function with the averaged samples here. | |||
| partial_gp_loss = partial(gradient_penalty_loss, | |||
| averaged_samples=averaged_samples, | |||
| gradient_penalty_weight=GRADIENT_PENALTY_WEIGHT) | |||
| partial_gp_loss.__name__ = 'gradient_penalty' # Functions need names or Keras will throw an error | |||
| # Keras requires that inputs and outputs have the same number of samples. This is why we didn't concatenate the | |||
| # real samples and generated samples before passing them to the discriminator: If we had, it would create an | |||
| # output with 2 * BATCH_SIZE samples, while the output of the "averaged" samples for gradient penalty | |||
| # would have only BATCH_SIZE samples. | |||
| # If we don't concatenate the real and generated samples, however, we get three outputs: One of the generated | |||
| # samples, one of the real samples, and one of the averaged samples, all of size BATCH_SIZE. This works neatly! | |||
| discriminator_model = Model(inputs=[real_samples, generator_input_for_discriminator, input_seq], | |||
| outputs=[discriminator_output_from_real_samples, | |||
| discriminator_output_from_generator, | |||
| averaged_samples_out]) | |||
| loss_weights2 = [1, 1, 1] | |||
# We use the Adam parameters from Gulrajani et al. We use the Wasserstein loss for both the real and generated
| # samples, and the gradient penalty loss for the averaged samples. | |||
| discriminator_model.compile(optimizer=SGD(clipvalue=0.01), # Adam(lr=4E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08), | |||
| loss=[wasserstein_loss, | |||
| wasserstein_loss, | |||
| partial_gp_loss], loss_weights = loss_weights2) | |||
| # set_trainable_state(discriminator, True) | |||
| # set_trainable_state(generator, True) | |||
| return generator_model, discriminator_model | |||
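# Hedged usage sketch (not part of the wrapper; array names are hypothetical): the two
# compiled models are meant to be trained in alternation, as in the accompanying
# training script - several critic updates per generator update.
def _wgan_training_step_sketch(generator_model, discriminator_model,
                               x_batch, y_batch, x_batch_2, y_batch_2, batch_size):
    positive_y = np.ones((batch_size, 1), dtype=np.float32)   # "real" label for the critic
    negative_y = -positive_y                                   # "generated" label for the critic
    dummy_y = np.zeros((batch_size, 1), dtype=np.float32)      # placeholder target for the gradient-penalty output
    # Critic update: real labels plus their sequences, and sequences fed through the generator.
    d_loss = discriminator_model.train_on_batch(
        [y_batch_2, x_batch, x_batch_2],
        [positive_y, negative_y, dummy_y])
    # Generator update: reconstruct the labels and try to make the (frozen) critic output large.
    g_loss = generator_model.train_on_batch(
        [x_batch, positive_y],
        [y_batch, positive_y])
    return d_loss, g_loss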
@@ -0,0 +1,193 @@
| from functools import partial | |||
| import numpy as np | |||
| from keras import backend as K | |||
| from keras.layers import Input, RepeatVector, Multiply, Lambda, Permute | |||
| from keras.layers.merge import _Merge | |||
| from keras.models import Model | |||
| from keras.optimizers import Adam, SGD | |||
| from keras.losses import binary_crossentropy | |||
| def wasserstein_loss(y_true, y_pred): | |||
| """Calculates the Wasserstein loss for a sample batch. | |||
| The Wasserstein loss function is very simple to calculate. In a standard GAN, the discriminator | |||
| has a sigmoid output, representing the probability that samples are real or generated. In Wasserstein | |||
| GANs, however, the output is linear with no activation function! Instead of being constrained to [0, 1], | |||
| the discriminator wants to make the distance between its output for real and generated samples as large as possible. | |||
| The most natural way to achieve this is to label generated samples -1 and real samples 1, instead of the | |||
| 0 and 1 used in normal GANs, so that multiplying the outputs by the labels will give you the loss immediately. | |||
| Note that the nature of this loss means that it can be (and frequently will be) less than 0.""" | |||
| return K.mean(-y_true * y_pred) | |||
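# Tiny numeric illustration of the docstring above (not used elsewhere): with labels of
# +1 for real and -1 for generated samples, the loss is the negated mean of
# label * critic_output, and it can legitimately be negative.
def _wasserstein_loss_example():
    y_true = np.array([1.0, 1.0, -1.0, -1.0])   # real, real, generated, generated
    y_pred = np.array([0.9, 0.4, -0.7, 0.1])    # unbounded critic outputs
    return float(np.mean(-y_true * y_pred))     # -(0.9 + 0.4 + 0.7 - 0.1) / 4 = -0.475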
| def generator_recunstruction_loss(y_true, y_pred, enableTrain): | |||
| def rescale_back_labels(labels): | |||
| rescaled_back_labels = (labels / 2) + 0.5 | |||
| return rescaled_back_labels | |||
| return binary_crossentropy(rescale_back_labels(y_true), rescale_back_labels(y_pred)) # * enableTrain | |||
| global enable_train | |||
| enable_train = Input(shape=(1,)) | |||
| global generator_recunstruction_loss_new | |||
| generator_recunstruction_loss_new = partial(generator_recunstruction_loss, enableTrain=enable_train) | |||
| generator_recunstruction_loss_new.__name__ = 'generator_recunstruction_loss_new' | |||
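# Minimal sketch (hypothetical names) of the partial() pattern used above and in
# gradient_penalty_loss: Keras only passes (y_true, y_pred) to a loss, so any extra
# argument has to be frozen beforehand and the resulting function needs a __name__.
def _partial_loss_sketch():
    def example_loss(y_true, y_pred, weight):
        return weight * np.mean(np.square(y_true - y_pred))
    weighted_loss = partial(example_loss, weight=10.0)
    weighted_loss.__name__ = 'weighted_example_loss'
    return weighted_loss(np.array([1.0, 0.0]), np.array([0.5, 0.0]))   # 10 * 0.125 = 1.25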
| def gradient_penalty_loss(y_true, y_pred, averaged_samples, gradient_penalty_weight): | |||
| """Calculates the gradient penalty loss for a batch of "averaged" samples. | |||
| In Improved WGANs, the 1-Lipschitz constraint is enforced by adding a term to the loss function | |||
| that penalizes the network if the gradient norm moves away from 1. However, it is impossible to evaluate | |||
| this function at all points in the input space. The compromise used in the paper is to choose random points | |||
| on the lines between real and generated samples, and check the gradients at these points. Note that it is the | |||
| gradient w.r.t. the input averaged samples, not the weights of the discriminator, that we're penalizing! | |||
| In order to evaluate the gradients, we must first run samples through the generator and evaluate the loss. | |||
| Then we get the gradients of the discriminator w.r.t. the input averaged samples. | |||
| The l2 norm and penalty can then be calculated for this gradient. | |||
| Note that this loss function requires the original averaged samples as input, but Keras only supports passing | |||
| y_true and y_pred to loss functions. To get around this, we make a partial() of the function with the | |||
| averaged_samples argument, and use that for model training.""" | |||
| # first get the gradients: | |||
| # assuming: - that y_pred has dimensions (batch_size, 1) | |||
| # - averaged_samples has dimensions (batch_size, nbr_features) | |||
| # gradients afterwards has dimension (batch_size, nbr_features), basically | |||
| # a list of nbr_features-dimensional gradient vectors | |||
| gradients = K.gradients(y_pred, averaged_samples)[0] | |||
| # compute the euclidean norm by squaring ... | |||
| gradients_sqr = K.square(gradients) | |||
| # ... summing over the rows ... | |||
| gradients_sqr_sum = K.sum(gradients_sqr, | |||
| axis=np.arange(1, len(gradients_sqr.shape))) | |||
| # ... and sqrt | |||
| gradient_l2_norm = K.sqrt(gradients_sqr_sum) | |||
| # compute lambda * (1 - ||grad||)^2 still for each single sample | |||
| gradient_penalty = gradient_penalty_weight * K.square(1 - gradient_l2_norm) | |||
| # return the mean as loss over all the batch samples | |||
| return K.mean(gradient_penalty) | |||
| def WGAN_wrapper(generator, discriminator, generator_input_shape, discriminator_input_shape, discriminator_input_shape2, | |||
| batch_size, gradient_penalty_weight): | |||
| BATCH_SIZE = batch_size | |||
| GRADIENT_PENALTY_WEIGHT = gradient_penalty_weight | |||
| def set_trainable_state(model, state): | |||
| for layer in model.layers: | |||
| layer.trainable = state | |||
| model.trainable = state | |||
| class RandomWeightedAverage(_Merge): | |||
| """Takes a randomly-weighted average of two tensors. In geometric terms, this outputs a random point on the line | |||
| between each pair of input points. | |||
| Inheriting from _Merge is a little messy but it was the quickest solution I could think of. | |||
| Improvements appreciated.""" | |||
| def _merge_function(self, inputs): | |||
| weights = K.random_uniform((BATCH_SIZE, 1)) | |||
| print(inputs[0]) | |||
| return (weights * inputs[0]) + ((1 - weights) * inputs[1]) | |||
| # The generator_model is used when we want to train the generator layers. | |||
| # As such, we ensure that the discriminator layers are not trainable. | |||
| # Note that once we compile this model, updating .trainable will have no effect within it. As such, it | |||
| # won't cause problems if we later set discriminator.trainable = True for the discriminator_model, as long | |||
| # as we compile the generator_model first. | |||
| set_trainable_state(discriminator, False) | |||
| set_trainable_state(generator, True) | |||
| # enable_train = Input(shape = (1,)) | |||
| generator_input = Input(shape=generator_input_shape) | |||
| generator_layers = generator(generator_input) | |||
| # masked_generator_layers = Lambda(mul)(generator_layers) | |||
| # discriminator_noise_in = Input(shape=(1,)) | |||
| # input_seq_g = Input(shape = discriminator_input_shape2) | |||
| discriminator_layers_for_generator= discriminator([generator_layers, generator_input]) | |||
| generator_model = Model(inputs=[generator_input, enable_train], | |||
| outputs=[generator_layers, discriminator_layers_for_generator]) | |||
# We use the Adam parameters from Gulrajani et al.
| # global generator_recunstruction_loss_new | |||
| # generator_recunstruction_loss_new = partial(generator_recunstruction_loss, enableTrain = enable_train) | |||
| # generator_recunstruction_loss_new.__name__ = 'generator_RLN' | |||
| loss = [generator_recunstruction_loss_new, wasserstein_loss] | |||
| loss_weights = [0, 1] | |||
| generator_model.compile(optimizer=Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08), | |||
| loss=loss, loss_weights=loss_weights) | |||
| # Now that the generator_model is compiled, we can make the discriminator layers trainable. | |||
| set_trainable_state(discriminator, True) | |||
| set_trainable_state(generator, False) | |||
| # The discriminator_model is more complex. It takes both real image samples and random noise seeds as input. | |||
| # The noise seed is run through the generator model to get generated images. Both real and generated images | |||
| # are then run through the discriminator. Although we could concatenate the real and generated images into a | |||
| # single tensor, we don't (see model compilation for why). | |||
| real_samples = Input(shape=discriminator_input_shape) | |||
| input_seq = Input(shape=discriminator_input_shape2) | |||
| generator_input_for_discriminator = Input(shape=generator_input_shape) | |||
| generated_samples_for_discriminator = generator(generator_input_for_discriminator) | |||
| # masked_real_samples = Lambda(mul)(real_samples) | |||
| # masked_generated_samples_for_discriminator= Lambda(mul)(generated_samples_for_discriminator) | |||
| discriminator_output_from_generator = discriminator( | |||
| [generated_samples_for_discriminator, generator_input_for_discriminator]) | |||
| discriminator_output_from_real_samples= discriminator([real_samples, input_seq]) | |||
| # We also need to generate weighted-averages of real and generated samples, to use for the gradient norm penalty. | |||
| averaged_samples = RandomWeightedAverage()([real_samples, generated_samples_for_discriminator]) | |||
| average_seq = RandomWeightedAverage()([input_seq, generator_input_for_discriminator]) | |||
| # We then run these samples through the discriminator as well. Note that we never really use the discriminator | |||
| # output for these samples - we're only running them to get the gradient norm for the gradient penalty loss. | |||
| # print('hehehe') | |||
| # print(averaged_samples) | |||
| # masked_averaged_samples = Lambda(mul)(averaged_samples) | |||
| averaged_samples_out = discriminator([averaged_samples, average_seq]) | |||
| # The gradient penalty loss function requires the input averaged samples to get gradients. However, | |||
| # Keras loss functions can only have two arguments, y_true and y_pred. We get around this by making a partial() | |||
| # of the function with the averaged samples here. | |||
| partial_gp_loss = partial(gradient_penalty_loss, | |||
| averaged_samples=averaged_samples, | |||
| gradient_penalty_weight=GRADIENT_PENALTY_WEIGHT) | |||
| partial_gp_loss.__name__ = 'gradient_penalty' # Functions need names or Keras will throw an error | |||
| # Keras requires that inputs and outputs have the same number of samples. This is why we didn't concatenate the | |||
| # real samples and generated samples before passing them to the discriminator: If we had, it would create an | |||
| # output with 2 * BATCH_SIZE samples, while the output of the "averaged" samples for gradient penalty | |||
| # would have only BATCH_SIZE samples. | |||
| # If we don't concatenate the real and generated samples, however, we get three outputs: One of the generated | |||
| # samples, one of the real samples, and one of the averaged samples, all of size BATCH_SIZE. This works neatly! | |||
| discriminator_model = Model(inputs=[real_samples, generator_input_for_discriminator, input_seq], | |||
| outputs=[discriminator_output_from_real_samples, | |||
| discriminator_output_from_generator, | |||
| averaged_samples_out]) | |||
| loss_weights2 = [1, 1, 1] | |||
# We use the Adam parameters from Gulrajani et al. We use the Wasserstein loss for both the real and generated
| # samples, and the gradient penalty loss for the averaged samples. | |||
| discriminator_model.compile(optimizer=SGD(clipvalue=0.01), # Adam(lr=4E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08) | |||
| loss=[wasserstein_loss, | |||
| wasserstein_loss, | |||
| partial_gp_loss], loss_weights=loss_weights2) | |||
| # set_trainable_state(discriminator, True) | |||
| # set_trainable_state(generator, True) | |||
| return generator_model, discriminator_model | |||
@@ -0,0 +1,868 @@
| #!/usr/bin/env python | |||
| """ | |||
| """ | |||
| from __future__ import division | |||
| import logging | |||
| import sys | |||
| import time | |||
| from collections import deque | |||
| from multiprocessing import Pool | |||
| import click as ck | |||
| import numpy as np | |||
| import pandas as pd | |||
| import tensorflow as tf | |||
| from keras import backend as K | |||
| from keras.callbacks import EarlyStopping, ModelCheckpoint | |||
| from keras.layers import ( | |||
| Dense, Input, SpatialDropout1D, Conv1D, MaxPooling1D, AveragePooling1D, Multiply,GaussianNoise, | |||
| Flatten, Concatenate, Add, Maximum, Embedding, BatchNormalization, Activation, Dropout) | |||
| from keras.losses import binary_crossentropy | |||
| from keras.models import Sequential, Model, load_model | |||
| from keras.preprocessing import sequence | |||
| from scipy.spatial import distance | |||
| from sklearn.metrics import log_loss | |||
| from sklearn.metrics import roc_curve, auc, matthews_corrcoef | |||
| from keras.layers import Lambda, LeakyReLU, RepeatVector | |||
| from sklearn.metrics import precision_recall_curve | |||
| from keras.callbacks import TensorBoard | |||
| from utils import ( | |||
| get_gene_ontology, | |||
| get_go_set, | |||
| get_anchestors, | |||
| get_parents, | |||
| DataGenerator, | |||
| FUNC_DICT, | |||
| get_height) | |||
| from conditional_wgan_wrapper_exp1 import WGAN_wrapper, wasserstein_loss, generator_recunstruction_loss_new | |||
| config = tf.ConfigProto() | |||
| config.gpu_options.allow_growth = True | |||
| sess = tf.Session(config=config) | |||
| K.set_session(sess) | |||
| logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO) | |||
| sys.setrecursionlimit(100000) | |||
| DATA_ROOT = 'Data/data1/' | |||
| MAXLEN = 1000 | |||
| REPLEN = 256 | |||
| ind = 0 | |||
| @ck.command() | |||
| @ck.option( | |||
| '--function', | |||
| default='bp', | |||
| help='Ontology id (mf, bp, cc)') | |||
| @ck.option( | |||
| '--device', | |||
| default='gpu:0', | |||
| help='GPU or CPU device id') | |||
| @ck.option( | |||
| '--org', | |||
| default= None, | |||
| help='Organism id for filtering test set') | |||
| @ck.option('--train', default=True, is_flag=True) | |||
| def main(function, device, org, train): | |||
| global FUNCTION | |||
| FUNCTION = function | |||
| global GO_ID | |||
| GO_ID = FUNC_DICT[FUNCTION] | |||
| global go | |||
| go = get_gene_ontology('go.obo') | |||
| global ORG | |||
| ORG = org | |||
| func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl') | |||
| global functions | |||
| functions = func_df['functions'].values | |||
| global embedds | |||
| embedds = load_go_embedding(functions) | |||
| global all_functions | |||
| all_functions = get_go_set(go, GO_ID) | |||
| logging.info('Functions: %s %d' % (FUNCTION, len(functions))) | |||
| if ORG is not None: | |||
| logging.info('Organism %s' % ORG) | |||
| global node_names | |||
| node_names = set() | |||
| with tf.device('/' + device): | |||
| params = { | |||
| 'fc_output': 1024, | |||
| 'embedding_dims': 128, | |||
| 'embedding_dropout': 0.2, | |||
| 'nb_conv': 2, | |||
| 'nb_dense': 1, | |||
| 'filter_length': 128, | |||
| 'nb_filter': 32, | |||
| 'pool_length': 64, | |||
| 'stride': 32 | |||
| } | |||
| model(params, is_train=train) | |||
| def load_go_embedding(functions): | |||
| f_embedding = open('data/swiss/term-graph-cellular-component-100d.emb', 'r') | |||
| f_index = open('data/swiss/term-id-cellular-component.txt','r') | |||
| embeddings = f_embedding.readlines() | |||
| index = f_index.readlines() | |||
dict1 = {}
for i in range(1, len(index)):
ind = list(map(str, index[i].split()))  # list() so the parsed fields can be indexed under Python 3
dict1[ind[0]] = ind[2]
dict2 = {}
for i in range(1, len(embeddings)):
embedding = list(map(str, embeddings[i].split()))
dict2[str(embedding[0])] = embedding[1:]
| embedds = [] | |||
| z = 0 | |||
| for i in range(len(functions)): | |||
| term = functions[i] | |||
| if term in dict1.keys(): | |||
| term_index = dict1[term] | |||
| term_embedd = dict2[str(term_index)] | |||
| else: | |||
| term_embedd = np.ones([100]) | |||
| embedds.append(term_embedd) | |||
| return np.array(embedds) | |||
| def load_data(): | |||
| df = pd.read_pickle(DATA_ROOT + 'train' + '-' + FUNCTION + '.pkl') | |||
| n = len(df) | |||
| index = df.index.values | |||
| valid_n = int(n * 0.8) | |||
| train_df = df.loc[index[:valid_n]] | |||
| valid_df = df.loc[index[valid_n:]] | |||
| test_df = pd.read_pickle(DATA_ROOT + 'test' + '-' + FUNCTION + '.pkl') | |||
| print( test_df['orgs'] ) | |||
| if ORG is not None: | |||
| logging.info('Unfiltered test size: %d' % len(test_df)) | |||
| test_df = test_df[test_df['orgs'] == ORG] | |||
| logging.info('Filtered test size: %d' % len(test_df)) | |||
| def reshape(values): | |||
| values = np.hstack(values).reshape( | |||
| len(values), len(values[0])) | |||
| return values | |||
| def normalize_minmax(values): | |||
| mn = np.min(values) | |||
| mx = np.max(values) | |||
| if mx - mn != 0.0: | |||
| return (values - mn) / (mx - mn) | |||
| return values - mn | |||
| def get_values(data_frame): | |||
| print(data_frame['labels'].values.shape) | |||
| labels = reshape(data_frame['labels'].values) | |||
| ngrams = sequence.pad_sequences( | |||
| data_frame['ngrams'].values, maxlen=MAXLEN) | |||
| ngrams = reshape(ngrams) | |||
| rep = reshape(data_frame['embeddings'].values) | |||
| data = ngrams | |||
| return data, labels | |||
| train = get_values(train_df) | |||
| valid = get_values(valid_df) | |||
| test = get_values(test_df) | |||
| return train, valid, test, train_df, valid_df, test_df | |||
| def load_data0(): | |||
| df = pd.read_pickle(DATA_ROOT + 'train' + '-' + FUNCTION + '.pkl') | |||
| n = len(df) | |||
| index = df.index.values | |||
| valid_n = int(n * 0.8) | |||
| train_df = df.loc[index[:valid_n]] | |||
| valid_df = df.loc[index[valid_n:]] | |||
| test_df = pd.read_pickle(DATA_ROOT + 'test' + '-' + FUNCTION + '.pkl') | |||
| print( test_df['orgs'] ) | |||
| if ORG is not None: | |||
| logging.info('Unfiltered test size: %d' % len(test_df)) | |||
| test_df = test_df[test_df['orgs'] == ORG] | |||
| logging.info('Filtered test size: %d' % len(test_df)) | |||
| def reshape(values): | |||
| values = np.hstack(values).reshape( | |||
| len(values), len(values[0])) | |||
| return values | |||
| def normalize_minmax(values): | |||
| mn = np.min(values) | |||
| mx = np.max(values) | |||
| if mx - mn != 0.0: | |||
| return (values - mn) / (mx - mn) | |||
| return values - mn | |||
| def get_values(data_frame, p_list): | |||
| accesion_numbers = data_frame['accessions'].values | |||
| all_labels = data_frame['labels'].values | |||
| all_ngrams = reshape(sequence.pad_sequences(data_frame['ngrams'].values, maxlen=MAXLEN) ) | |||
| print(len(all_labels)) | |||
| print(len(p_list)) | |||
| labels = [] | |||
| data = [] | |||
| for i in range(accesion_numbers.shape[0]): | |||
| if accesion_numbers[i] in p_list: | |||
| labels.append(all_labels[i]) | |||
| data.append(all_ngrams[i]) | |||
| return np.asarray(data), np.asarray(labels) | |||
| dev_F = pd.read_csv('dev-' + FUNCTION + '-input.tsv', sep='\t', header=0, index_col=0) | |||
| test_F = pd.read_csv('test-' + FUNCTION + '-input.tsv', sep='\t', header=0, index_col=0) | |||
| train_F = pd.read_csv('train-' + FUNCTION + '-input.tsv', sep='\t', header=0, index_col=0) | |||
| train_l = train_F.index.tolist() | |||
| test_l = test_F.index.tolist() | |||
| dev_l = dev_F.index.tolist() | |||
| train = get_values(train_df, train_l) | |||
| valid = get_values(valid_df, dev_l) | |||
| test = get_values(test_df, test_l) | |||
| return train, valid, test, train_df, valid_df, test_df | |||
| class TrainValTensorBoard(TensorBoard): | |||
| def __init__(self, log_dir='./t_logs', **kwargs): | |||
| # Make the original `TensorBoard` log to a subdirectory 'training' | |||
| training_log_dir = log_dir | |||
| super(TrainValTensorBoard, self).__init__(training_log_dir, **kwargs) | |||
| # Log the validation metrics to a separate subdirectory | |||
| self.val_log_dir = './v_logs' | |||
| self.dis_log_dir = './d_logs' | |||
| self.gen_log_dir = './g_logs' | |||
| def set_model(self, model): | |||
| # Setup writer for validation metrics | |||
| self.val_writer = tf.summary.FileWriter(self.val_log_dir) | |||
| self.dis_writer = tf.summary.FileWriter(self.dis_log_dir) | |||
| self.gen_writer = tf.summary.FileWriter(self.gen_log_dir) | |||
| super(TrainValTensorBoard, self).set_model(model) | |||
| def on_epoch_end(self, epoch, logs=None): | |||
| # Pop the validation logs and handle them separately with | |||
| # `self.val_writer`. Also rename the keys so that they can | |||
| # be plotted on the same figure with the training metrics | |||
| logs = logs or {} | |||
| val_logs = {k.replace('val_', 'v_'): v for k, v in logs.items() if k.startswith('val_')} | |||
| for name, value in val_logs.items(): | |||
| summary = tf.Summary() | |||
| summary_value = summary.value.add() | |||
| summary_value.simple_value = value.item() | |||
| summary_value.tag = name | |||
| self.val_writer.add_summary(summary, epoch) | |||
| self.val_writer.flush() | |||
| logs = logs or {} | |||
| dis_logs = {k.replace('discriminator_', 'd_'): v for k, v in logs.items() if k.startswith('discriminator_')} | |||
| for name, value in dis_logs.items(): | |||
| summary = tf.Summary() | |||
| summary_value = summary.value.add() | |||
| summary_value.simple_value = value.item() | |||
| summary_value.tag = name | |||
| self.dis_writer.add_summary(summary, epoch) | |||
| self.dis_writer.flush() | |||
| logs = logs or {} | |||
| gen_logs = {k.replace('generator_', 'g_'): v for k, v in logs.items() if k.startswith('generator_')} | |||
| for name, value in gen_logs.items(): | |||
| summary = tf.Summary() | |||
| summary_value = summary.value.add() | |||
| summary_value.simple_value = value.item() | |||
| summary_value.tag = name | |||
| self.gen_writer.add_summary(summary, epoch) | |||
| self.gen_writer.flush() | |||
| # Pass the remaining logs to `TensorBoard.on_epoch_end` | |||
| t_logs = {k: v for k, v in logs.items() if not k.startswith('val_')} | |||
| tr_logs = {k: v for k, v in t_logs.items() if not k.startswith('discriminator_')} | |||
| tra_logs = {k: v for k, v in tr_logs.items() if not k.startswith('generator_')} | |||
| super(TrainValTensorBoard, self).on_epoch_end(epoch, tra_logs) | |||
| def on_train_end(self, logs=None): | |||
| super(TrainValTensorBoard, self).on_train_end(logs) | |||
| self.val_writer.close() | |||
| self.gen_writer.close() | |||
| self.dis_writer.close() | |||
| def get_feature_model(params): | |||
| embedding_dims = params['embedding_dims'] | |||
| max_features = 8001 | |||
| model = Sequential() | |||
| model.add(Embedding( | |||
| max_features, | |||
| embedding_dims, | |||
| input_length=MAXLEN)) | |||
| model.add(SpatialDropout1D(0.4)) | |||
| model.add(Conv1D( | |||
| padding="valid", | |||
| strides=1, | |||
| filters=params['nb_filter'], | |||
| kernel_size=params['filter_length'])) | |||
| model.add(LeakyReLU()) | |||
| #model.add(BatchNormalization()) | |||
| model.add(Conv1D( | |||
| padding="valid", | |||
| strides=1, | |||
| filters=params['nb_filter'], | |||
| kernel_size=params['filter_length'])) | |||
| model.add(LeakyReLU()) | |||
# model.add(BatchNormalization())  # disabled: causes a problem when loading the best saved model
| model.add(AveragePooling1D(strides=params['stride'], pool_size=params['pool_length'])) | |||
| model.add(Flatten()) | |||
| return model | |||
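# Illustrative shape arithmetic (assuming the exp1 params above: 32 filters, kernel 128,
# pool 64, stride 32, MAXLEN = 1000): two 'valid' Conv1D layers followed by average
# pooling and Flatten give 22 * 32 = 704 features.
def _feature_model_output_size(maxlen=1000, kernel=128, n_filters=32, pool=64, stride=32):
    steps = maxlen - kernel + 1            # first Conv1D ('valid'): 873
    steps = steps - kernel + 1             # second Conv1D ('valid'): 746
    steps = (steps - pool) // stride + 1   # AveragePooling1D: 22
    return steps * n_filters               # Flatten: 704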
| #def get_node_name(go_id, unique=False): | |||
| # name = go_id.split(':')[1] | |||
| # if not unique: | |||
| # return name | |||
| # if name not in node_names: | |||
| # node_names.add(name) | |||
| # return name | |||
| # i = 1 | |||
| # while (name + '_' + str(i)) in node_names: | |||
| # i += 1 | |||
| # name = name + '_' + str(i) | |||
| # node_names.add(name) | |||
| # return name | |||
| #def get_layers(inputs): | |||
| # q = deque() | |||
| # layers = {} | |||
| # name = get_node_name(GO_ID) | |||
| # layers[GO_ID] = {'net': inputs} | |||
| # for node_id in go[GO_ID]['children']: | |||
| # if node_id in func_set: | |||
| # q.append((node_id, inputs)) | |||
| # while len(q) > 0: | |||
| # node_id, net = q.popleft() | |||
| # name = get_node_name(node_id) | |||
| # net, output = get_function_node(name, inputs) | |||
| # if node_id not in layers: | |||
| # layers[node_id] = {'net': net, 'output': output} | |||
| # for n_id in go[node_id]['children']: | |||
| # if n_id in func_set and n_id not in layers: | |||
| # ok = True | |||
| # for p_id in get_parents(go, n_id): | |||
| # if p_id in func_set and p_id not in layers: | |||
| # ok = False | |||
| # if ok: | |||
| # q.append((n_id, net)) | |||
| # for node_id in functions: | |||
| # childs = set(go[node_id]['children']).intersection(func_set) | |||
| # if len(childs) > 0: | |||
| # outputs = [layers[node_id]['output']] | |||
| # for ch_id in childs: | |||
| # outputs.append(layers[ch_id]['output']) | |||
| # name = get_node_name(node_id) + '_max' | |||
| # layers[node_id]['output'] = Maximum(name=name)(outputs) | |||
| # return layers | |||
| #def get_function_node(name, inputs): | |||
| # output_name = name + '_out' | |||
| # output = Dense(1, name=output_name, activation='sigmoid')(inputs) | |||
| # return output, output | |||
| def get_generator(params, n_classes): | |||
| inputs = Input(shape=(MAXLEN,), dtype='int32', name='input1') | |||
| feature_model = get_feature_model(params)(inputs) | |||
| net = Dense(200)(feature_model) #bp(dis1 g1) 400 - cc 200 - mf 200 | |||
| net = BatchNormalization()(net) | |||
| net = LeakyReLU()(net) | |||
# To impose the DAG hierarchy directly on the model (inspired by DeepGO):
| #layers = get_layers(net) | |||
| #output_models = [] | |||
| #for i in range(len(functions)): | |||
| # output_models.append(layers[functions[i]]['output']) | |||
| #net = Concatenate(axis=1)(output_models) | |||
| output = Dense(n_classes, activation='tanh')(net) #'sigmoid' | |||
| model = Model(inputs=inputs, outputs=output) | |||
| return model | |||
| def get_discriminator(params, n_classes, dropout_rate=0.5): | |||
| inputs = Input(shape=(n_classes, )) | |||
| #n_inputs = GaussianNoise(0.1)(inputs) | |||
| #x = Conv1D(filters=8, kernel_size=128, padding='valid', strides=1)(n_inputs) | |||
| #x = BatchNormalization()(x) | |||
| #x = LeakyReLU()(x) | |||
| #x = Conv1D(filters=8, kernel_size=128, padding='valid', strides=1)(x) | |||
| #x = BatchNormalization()(x) | |||
| #x = LeakyReLU()(x) | |||
| #x = AveragePooling1D(strides=params['stride'], pool_size=params['pool_length'])(x) | |||
| #x = Flatten()(x) | |||
| #x = Lambda(lambda y: K.squeeze(y, 2))(x) | |||
| inputs2 = Input(shape =(MAXLEN,), dtype ='int32', name='d_input2') | |||
| x2 = Embedding(8001,128, input_length=MAXLEN)(inputs2) | |||
| x2 = SpatialDropout1D(0.4)(x2) | |||
| x2 = Conv1D(filters=8, kernel_size=128, padding='valid', strides=1)(x2) | |||
| x2 = BatchNormalization()(x2) | |||
| x2 = LeakyReLU()(x2) | |||
| #x2 = Conv1D(filters=1, kernel_size=1, padding='valid', strides=1)(x2) | |||
| #x2 = BatchNormalization()(x2) | |||
| #x2 = LeakyReLU()(x2) | |||
| #x2 = AveragePooling1D(strides=params['stride'], pool_size=params['pool_length'])(x2) | |||
| x2 = Flatten()(x2) | |||
| size = 100 | |||
| x2 = Dropout(dropout_rate)(x2) | |||
| x2 = Dense(size)(x2) | |||
| x2 = BatchNormalization()(x2) | |||
| x2 = LeakyReLU()(x2) | |||
| size = 100 | |||
| x = Dropout(dropout_rate)(inputs) | |||
| x = Dense(size)(x) | |||
| x = BatchNormalization()(x) | |||
| x = LeakyReLU()(x) | |||
| x = Concatenate(axis=1, name='merged2')([x, x2]) | |||
| layer_sizes = [30, 30, 30, 30, 30, 30] | |||
| for size in layer_sizes: | |||
| x = Dropout(dropout_rate)(x) | |||
| x = Dense(size)(x) | |||
| x = BatchNormalization()(x) | |||
| x = LeakyReLU()(x) | |||
| outputs = Dense(1)(x) | |||
| model = Model(inputs = [inputs, inputs2], outputs=[outputs], name='Discriminator') | |||
| return model | |||
| def rescale_labels(labels): | |||
| rescaled_labels = 2*(labels - 0.5) | |||
| return rescaled_labels | |||
| def rescale_back_labels(labels): | |||
| rescaled_back_labels =(labels/2)+0.5 | |||
| return rescaled_back_labels | |||
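# Quick numeric check (illustrative only): rescale_labels maps {0, 1} targets onto the
# generator's tanh range [-1, 1], and rescale_back_labels inverts the mapping.
def _rescale_example():
    labels = np.array([0.0, 0.5, 1.0])
    assert np.allclose(rescale_labels(labels), [-1.0, 0.0, 1.0])
    assert np.allclose(rescale_back_labels(rescale_labels(labels)), labels)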
| def get_model(params,nb_classes, batch_size, GRADIENT_PENALTY_WEIGHT=10): | |||
| generator = get_generator(params, nb_classes) | |||
| discriminator = get_discriminator(params, nb_classes) | |||
| generator_model, discriminator_model = \ | |||
| WGAN_wrapper(generator=generator, | |||
| discriminator=discriminator, | |||
| generator_input_shape=(MAXLEN,), | |||
| discriminator_input_shape=(nb_classes,), | |||
| discriminator_input_shape2 = (MAXLEN, ), | |||
| batch_size=batch_size, | |||
| gradient_penalty_weight=GRADIENT_PENALTY_WEIGHT, | |||
| embeddings=embedds) | |||
| logging.info('Compilation finished') | |||
| return generator_model, discriminator_model | |||
| def train_wgan(generator_model, discriminator_model, batch_size, epochs, | |||
| x_train, y_train, x_val, y_val, x_test, y_test, generator_model_path, discriminator_model_path, | |||
| TRAINING_RATIO=10, N_WARM_UP=20): | |||
| BATCH_SIZE = batch_size | |||
| N_EPOCH = epochs | |||
| positive_y = np.ones((batch_size, 1), dtype=np.float32) | |||
| zero_y = positive_y * 0 | |||
| negative_y = -positive_y | |||
| positive_full_y = np.ones((BATCH_SIZE * TRAINING_RATIO, 1), dtype=np.float32) | |||
| dummy_y = np.zeros((BATCH_SIZE, 1), dtype=np.float32) | |||
| positive_full_enable_train = np.ones((len(x_train), 1), dtype=np.float32) | |||
| positive_full_enable_val = np.ones((len(x_val), 1), dtype=np.float32) | |||
| positive_full_enable_test = np.ones((len(x_test), 1), dtype=np.float32) | |||
| best_validation_loss = None | |||
| callback = TrainValTensorBoard(write_graph=False) #TensorBoard(log_path) | |||
| callback.set_model(generator_model) | |||
| for epoch in range(N_EPOCH): | |||
| # np.random.shuffle(X_train) | |||
| print("Epoch: ", epoch) | |||
| print("Number of batches: ", int(y_train.shape[0] // BATCH_SIZE)) | |||
| discriminator_loss = [] | |||
| generator_loss = [] | |||
| minibatches_size = BATCH_SIZE * TRAINING_RATIO | |||
| shuffled_indexes = np.random.permutation(x_train.shape[0]) | |||
| shuffled_indexes_2 = np.random.permutation(x_train.shape[0]) | |||
| for i in range(int(y_train.shape[0] // (BATCH_SIZE * TRAINING_RATIO))): | |||
| batch_indexes = shuffled_indexes[i * minibatches_size:(i + 1) * minibatches_size] | |||
| batch_indexes_2 = shuffled_indexes_2[i * minibatches_size:(i + 1) * minibatches_size] | |||
| x = x_train[batch_indexes] | |||
| y = y_train[batch_indexes] | |||
| y_2 = y_train[batch_indexes_2] | |||
| x_2 = x_train[batch_indexes_2] | |||
| if epoch < N_WARM_UP: | |||
| for j in range(TRAINING_RATIO): | |||
| x_batch = x[j * BATCH_SIZE:(j + 1) * BATCH_SIZE] | |||
| y_batch = y[j * BATCH_SIZE:(j + 1) * BATCH_SIZE] | |||
| generator_loss.append(generator_model.train_on_batch([x_batch, positive_y], [y_batch, zero_y])) | |||
| discriminator_loss.append(0) | |||
| else: | |||
| for j in range(TRAINING_RATIO): | |||
| x_batch = x[j * BATCH_SIZE:(j + 1) * BATCH_SIZE] | |||
| y_batch = y[j * BATCH_SIZE:(j + 1) * BATCH_SIZE] | |||
| y_batch_2 = y_2[j * BATCH_SIZE:(j + 1) * BATCH_SIZE] | |||
| x_batch_2 = x_2[j * BATCH_SIZE:(j + 1) * BATCH_SIZE] | |||
| # noise = np.random.rand(BATCH_SIZE, 100).astype(np.float32) | |||
| noise = x_batch | |||
| discriminator_loss.append(discriminator_model.train_on_batch( | |||
| [y_batch_2, noise, x_batch_2], | |||
| [positive_y, negative_y, dummy_y])) | |||
| generator_loss.append(generator_model.train_on_batch([x, positive_full_y], [y, positive_full_y])) | |||
| predicted_y_train = generator_model.predict([x_train, positive_full_enable_train], batch_size=BATCH_SIZE)[0] | |||
| predicted_y_val = generator_model.predict([x_val, positive_full_enable_val], batch_size=BATCH_SIZE)[0] | |||
| train_loss = log_loss(rescale_back_labels(y_train), rescale_back_labels(predicted_y_train)) | |||
| val_loss = log_loss(y_val, rescale_back_labels(predicted_y_val)) | |||
| callback.on_epoch_end(epoch, | |||
| logs={'discriminator_loss': (np.sum(np.asarray(discriminator_loss)) if discriminator_loss else -1), | |||
| 'generator_loss': (np.sum(np.asarray(generator_loss)) if generator_loss else -1),#train_loss, | |||
| 'train_bce_loss': train_loss, | |||
| 'val_bce_loss': val_loss}) | |||
| print("train bce loss: {:.4f}, validation bce loss: {:.4f}, generator loss: {:.4f}, discriminator loss: {:.4f}".format( | |||
| train_loss, val_loss, | |||
| (np.sum(np.asarray(generator_loss)) if generator_loss else -1) / x_train.shape[0], | |||
| (np.sum(np.asarray(discriminator_loss)) if discriminator_loss else -1) / x_train.shape[0])) | |||
| if best_validation_loss is None or best_validation_loss > val_loss: | |||
print('\nEpoch %05d: validation loss improved to %0.5f,'
' saving model to %s and %s'
% (epoch + 1, val_loss, generator_model_path, discriminator_model_path))
| best_validation_loss = val_loss | |||
| generator_model.save(generator_model_path, overwrite=True) | |||
| discriminator_model.save(discriminator_model_path, overwrite=True) | |||
| def model(params, batch_size=64, nb_epoch=10000, is_train=True): | |||
| nb_classes = len(functions) | |||
| logging.info("Loading Data") | |||
| train, val, test, train_df, valid_df, test_df = load_data() | |||
| train_data, train_labels = train | |||
| val_data, val_labels = val | |||
| test_data, test_labels = test | |||
| train_labels = rescale_labels(train_labels) | |||
| logging.info("Training data size: %d" % len(train_data)) | |||
| logging.info("Validation data size: %d" % len(val_data)) | |||
| logging.info("Test data size: %d" % len(test_data)) | |||
| generator_model_path = DATA_ROOT + 'models/new_model_seq_' + FUNCTION + '.h5' | |||
| discriminator_model_path = DATA_ROOT + 'models/new_model_disc_seq_' + FUNCTION + '.h5' | |||
| logging.info('Starting training the model') | |||
| train_generator = DataGenerator(batch_size, nb_classes) | |||
| train_generator.fit(train_data, train_labels) | |||
| valid_generator = DataGenerator(batch_size, nb_classes) | |||
| valid_generator.fit(val_data, val_labels) | |||
| test_generator = DataGenerator(batch_size, nb_classes) | |||
| test_generator.fit(test_data, test_labels) | |||
| if is_train: | |||
| generator_model, discriminator_model = get_model(params, nb_classes, batch_size) | |||
| #Loading a pretrained model | |||
| #generator_model.load_weights(DATA_ROOT + 'models/trained_bce/new_model_seq_' + FUNCTION + '.h5') | |||
| train_wgan(generator_model, discriminator_model, batch_size=batch_size, epochs=nb_epoch, | |||
| x_train=train_data, y_train=train_labels , x_val=val_data, y_val=val_labels, x_test = test_data, y_test = test_labels, | |||
| generator_model_path=generator_model_path, | |||
| discriminator_model_path=discriminator_model_path) | |||
| logging.info('Loading best model') | |||
| model = load_model(generator_model_path, | |||
| custom_objects={'generator_recunstruction_loss_new': generator_recunstruction_loss_new, | |||
| 'wasserstein_loss': wasserstein_loss}) | |||
| logging.info('Predicting') | |||
| start_time = time.time() | |||
| preds = model.predict_generator(test_generator, steps=len(test_data) / batch_size)[0] | |||
| end_time = time.time() | |||
logging.info('%f seconds of prediction time for %d protein sequences' % (end_time - start_time, len(test_data)))
| preds = rescale_back_labels(preds) | |||
| #preds = propagate_score(preds, functions) | |||
| logging.info('Computing performance') | |||
| f, p, r, t, preds_max = compute_performance(preds, test_labels) | |||
| micro_roc_auc = compute_roc(preds, test_labels) | |||
| macro_roc_auc = compute_roc_macro(preds, test_labels) | |||
| mcc = compute_mcc(preds_max, test_labels) | |||
| aupr, _ = compute_aupr(preds, test_labels) | |||
| tpr_score, total = compute_tpr_score(preds_max, functions) | |||
| m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max = micro_macro_function_centric_f1(preds.T, test_labels.T) | |||
| function_centric_performance(functions, preds.T, test_labels.T, rescale_back_labels(train_labels).T, t) | |||
| logging.info('Protein centric macro Th, PR, RC, F1: \t %f %f %f %f' % (t, p, r, f)) | |||
| logging.info('Micro ROC AUC: \t %f ' % (micro_roc_auc, )) | |||
| logging.info('Macro ROC AUC: \t %f ' % (macro_roc_auc,)) | |||
| logging.info('MCC: \t %f ' % (mcc, )) | |||
| logging.info('AUPR: \t %f ' % (aupr, )) | |||
| logging.info('TPR_Score from total: \t %f \t %f ' % (tpr_score,total)) | |||
| logging.info('Function centric macro PR, RC, F1: \t %f %f %f' % (M_pr_max, M_rc_max, M_f1_max) ) | |||
| logging.info('Function centric micro PR, RC, F1: \t %f %f %f' % (m_pr_max, m_rc_max, m_f1_max) ) | |||
| def function_centric_performance(functions, preds, labels, labels_train, threshold): | |||
| results = [] | |||
| preds = np.round(preds, 2) | |||
| for i in range(preds.shape[0]): | |||
| predictions = (preds[i, :] > threshold).astype(np.int32) | |||
| tp = np.sum(predictions * labels[i, :]) | |||
| fp = np.sum(predictions) - tp | |||
| fn = np.sum(labels[i, :]) - tp | |||
| if tp > 0: | |||
| precision = tp / (1.0 * (tp + fp)) | |||
| recall = tp / (1.0 * (tp + fn)) | |||
| f = 2 * precision * recall / (precision + recall) | |||
| else: | |||
| if fp == 0 and fn == 0: | |||
| precision = 1 | |||
| recall = 1 | |||
| f = 1 | |||
| else: | |||
| precision = 0 | |||
| recall = 0 | |||
| f = 0 | |||
| f_max = f | |||
| p_max = precision | |||
| r_max = recall | |||
| num_prots_train = np.sum(labels_train[i, :]) | |||
| height = get_height(go, functions[i]) | |||
| results.append([functions[i], num_prots_train, height, f_max, p_max, r_max]) | |||
| results = pd.DataFrame(results) | |||
| results.to_csv('Con_GodGanSeq_results_' + FUNCTION + '.txt', sep='\t', index=False) | |||
| def micro_macro_function_centric_f1(preds, labels): | |||
| predictions = np.round(preds) | |||
| m_tp = 0 | |||
| m_fp = 0 | |||
| m_fn = 0 | |||
| M_pr = 0 | |||
| M_rc = 0 | |||
| for i in range(preds.shape[0]): | |||
| tp = np.sum(predictions[i, :] * labels[i, :]) | |||
| fp = np.sum(predictions[i, :]) - tp | |||
| fn = np.sum(labels[i, :]) - tp | |||
| m_tp += tp | |||
| m_fp += fp | |||
| m_fn += fn | |||
| if tp > 0: | |||
| pr = 1.0 * tp / (1.0 * (tp + fp)) | |||
| rc = 1.0 * tp / (1.0 * (tp + fn)) | |||
| else: | |||
| if fp == 0 and fn == 0: | |||
| pr = 1 | |||
| rc = 1 | |||
| else: | |||
| pr = 0 | |||
| rc = 0 | |||
| M_pr += pr | |||
| M_rc += rc | |||
| if m_tp > 0: | |||
| m_pr = 1.0 * m_tp / (1.0 * (m_tp + m_fp)) | |||
| m_rc = 1.0 * m_tp / (1.0 * (m_tp + m_fn)) | |||
| m_f1 = 2.0 * m_pr * m_rc / (m_pr + m_rc) | |||
| else: | |||
| if m_fp == 0 and m_fn == 0: | |||
| m_pr = 1 | |||
| m_rc = 1 | |||
| m_f1 = 1 | |||
| else: | |||
| m_pr = 0 | |||
| m_rc = 0 | |||
| m_f1 = 0 | |||
| M_pr /= preds.shape[0] | |||
| M_rc /= preds.shape[0] | |||
| if M_pr == 0 and M_rc == 0: | |||
| M_f1 = 0 | |||
| else: | |||
| M_f1 = 2.0 * M_pr * M_rc / (M_pr + M_rc) | |||
| return m_pr, m_rc, m_f1, M_pr, M_rc, M_f1 | |||
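# Toy example (illustrative only) of the averaging performed above for two GO terms:
# micro pools TP/FP/FN over all terms, macro averages the per-term precision/recall.
def _micro_macro_f1_example():
    preds = np.array([[1, 1, 0, 0],    # term 1 predictions over 4 proteins
                      [0, 0, 0, 1]])   # term 2 predictions
    labels = np.array([[1, 0, 0, 0],
                       [0, 0, 1, 1]])
    # returns (2/3, 2/3, 2/3, 0.75, 0.75, 0.75)
    return micro_macro_function_centric_f1(preds, labels)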
| def compute_roc(preds, labels): | |||
| # Compute ROC curve and ROC area for each class | |||
| fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten()) | |||
| roc_auc = auc(fpr, tpr) | |||
| return roc_auc | |||
| def compute_roc_macro(yhat_raw, y): | |||
| if yhat_raw.shape[0] <= 1: | |||
| return | |||
| fpr = {} | |||
| tpr = {} | |||
| roc_auc = {} | |||
| # get AUC for each label individually | |||
| relevant_labels = [] | |||
| auc_labels = {} | |||
| for i in range(y.shape[1]): | |||
| # only if there are true positives for this label | |||
| if y[:, i].sum() > 0: | |||
| fpr[i], tpr[i], _ = roc_curve(y[:, i], yhat_raw[:, i]) | |||
| if len(fpr[i]) > 1 and len(tpr[i]) > 1: | |||
| auc_score = auc(fpr[i], tpr[i]) | |||
| if not np.isnan(auc_score): | |||
| auc_labels["auc_%d" % i] = auc_score | |||
| relevant_labels.append(i) | |||
| # macro-AUC: just average the auc scores | |||
| aucs = [] | |||
| for i in relevant_labels: | |||
| aucs.append(auc_labels['auc_%d' % i]) | |||
| roc_auc_macro = np.mean(aucs) | |||
| return roc_auc_macro | |||
| def compute_aupr(preds, labels): | |||
| # Compute ROC curve and ROC area for each class | |||
| pr, rc, threshold =precision_recall_curve(labels.flatten(), preds.flatten()) | |||
| pr_auc = auc(rc, pr) | |||
| M_pr_auc = 0 | |||
| return pr_auc, M_pr_auc | |||
| def compute_mcc(preds, labels): | |||
| # Compute ROC curve and ROC area for each class | |||
| mcc = matthews_corrcoef(labels.flatten(), preds.flatten()) | |||
| return mcc | |||
| def compute_performance(preds, labels): | |||
| predicts = np.round(preds, 2) | |||
| f_max = 0 | |||
| p_max = 0 | |||
| r_max = 0 | |||
| t_max = 0 | |||
| for t in range(1, 100): | |||
| threshold = t / 100.0 | |||
| predictions = (predicts > threshold).astype(np.int32) | |||
| total = 0 | |||
| f = 0.0 | |||
| p = 0.0 | |||
| r = 0.0 | |||
| p_total = 0 | |||
| for i in range(labels.shape[0]): | |||
| tp = np.sum(predictions[i, :] * labels[i, :]) | |||
| fp = np.sum(predictions[i, :]) - tp | |||
| fn = np.sum(labels[i, :]) - tp | |||
| if tp == 0 and fp == 0 and fn == 0: | |||
| continue | |||
| total += 1 | |||
| if tp != 0: | |||
| p_total += 1 | |||
| precision = tp / (1.0 * (tp + fp)) | |||
| recall = tp / (1.0 * (tp + fn)) | |||
| p += precision | |||
| r += recall | |||
| if p_total == 0: | |||
| continue | |||
| r /= total | |||
| p /= p_total | |||
| if p + r > 0: | |||
| f = 2 * p * r / (p + r) | |||
| if f_max < f: | |||
| f_max = f | |||
| p_max = p | |||
| r_max = r | |||
| t_max = threshold | |||
| predictions_max = predictions | |||
| return f_max, p_max, r_max, t_max, predictions_max | |||
| def get_anchestors(go, go_id): | |||
| go_set = set() | |||
| q = deque() | |||
| q.append(go_id) | |||
| while(len(q) > 0): | |||
| g_id = q.popleft() | |||
| go_set.add(g_id) | |||
| for parent_id in go[g_id]['is_a']: | |||
| if parent_id in go: | |||
| q.append(parent_id) | |||
| return go_set | |||
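# Toy DAG example (illustrative only): with is_a edges child -> parent -> root,
# get_anchestors returns the term itself plus every ancestor reachable via is_a.
def _get_anchestors_example():
    toy_go = {
        'GO:root':   {'is_a': []},
        'GO:parent': {'is_a': ['GO:root']},
        'GO:child':  {'is_a': ['GO:parent']},
    }
    return get_anchestors(toy_go, 'GO:child')   # {'GO:child', 'GO:parent', 'GO:root'}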
| def compute_tpr_score(preds_max, functions): | |||
| my_dict = {} | |||
| for i in range(len(functions)): | |||
| my_dict[functions[i]] = i | |||
| total =0 | |||
| anc = np.zeros([len(functions), len(functions)]) | |||
| for i in range(len(functions)): | |||
| anchestors = list(get_anchestors(go, functions[i])) | |||
| for j in range(len(anchestors)): | |||
| if anchestors[j] in my_dict.keys(): | |||
| anc[i, my_dict[anchestors[j]]]=1 | |||
| total +=1 | |||
| tpr_score = 0 | |||
| for i in range(preds_max.shape[0]): | |||
| for j in range(preds_max.shape[1]): | |||
| if preds_max[i, j] == 1: | |||
| for k in range(anc.shape[1]): | |||
| if anc[j, k]==1: | |||
| if preds_max[i,k]!=1: | |||
| tpr_score = tpr_score +1 | |||
| return tpr_score/preds_max.shape[0], total | |||
| def propagate_score(preds, functions): | |||
| my_dict = {} | |||
| for i in range(len(functions)): | |||
| my_dict[functions[i]] = i | |||
| anc = np.zeros([len(functions), len(functions)]) | |||
| for i in range(len(functions)): | |||
| anchestors = list(get_anchestors(go, functions[i])) | |||
| for j in range(len(anchestors)): | |||
| if anchestors[j] in my_dict.keys(): | |||
| anc[i, my_dict[anchestors[j]]] = 1 | |||
| new_preds = np.array(preds) | |||
| for i in range(new_preds.shape[1]): | |||
| for k in range(anc.shape[1]): | |||
| if anc[i, k] == 1: | |||
| new_preds[new_preds[:, i] > new_preds[:, k], k] = new_preds[new_preds[:, i] > new_preds[:, k], i] | |||
| return new_preds | |||
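# Illustrative sketch of the rule applied above, using a hand-built ancestor matrix
# instead of the global GO graph: whenever a child term scores higher than one of its
# ancestors, the ancestor's score is raised to match, making predictions hierarchy-consistent.
def _propagation_rule_example():
    preds = np.array([[0.2, 0.9]])    # columns: [parent, child] for a single protein
    anc = np.array([[1, 0],           # ancestors of column 0 (parent): itself
                    [1, 1]])          # ancestors of column 1 (child): parent and itself
    new_preds = np.array(preds)
    for i in range(new_preds.shape[1]):
        for k in range(anc.shape[1]):
            if anc[i, k] == 1:
                mask = new_preds[:, i] > new_preds[:, k]
                new_preds[mask, k] = new_preds[mask, i]
    return new_preds                  # [[0.9, 0.9]]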
| if __name__ == '__main__': | |||
| main() | |||
@@ -0,0 +1,636 @@
| #!/usr/bin/env python | |||
| """ | |||
| """ | |||
| from __future__ import division | |||
| import logging | |||
| import sys | |||
| import time | |||
| import click as ck | |||
| import numpy as np | |||
| import pandas as pd | |||
| import tensorflow as tf | |||
| from keras import backend as K | |||
| from keras.layers import ( | |||
| Dense, Input, SpatialDropout1D, Conv1D, MaxPooling1D, GaussianNoise, | |||
| Flatten, Concatenate, Add, Maximum, Embedding, BatchNormalization, Activation, Dropout) | |||
| from keras.models import Sequential, Model, load_model | |||
| from sklearn.metrics import log_loss | |||
| from sklearn.metrics import roc_curve, auc, matthews_corrcoef, mean_squared_error | |||
| from keras.layers import Lambda, LeakyReLU | |||
| from sklearn.metrics import precision_recall_curve | |||
| from keras.callbacks import TensorBoard | |||
| from skimage import measure | |||
| from utils import DataGenerator | |||
| from conditional_wgan_wrapper_exp2 import WGAN_wrapper, wasserstein_loss, generator_recunstruction_loss_new | |||
| config = tf.ConfigProto() | |||
| config.gpu_options.allow_growth = True | |||
| sess = tf.Session(config=config) | |||
| K.set_session(sess) | |||
| logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO) | |||
| sys.setrecursionlimit(100000) | |||
| DATA_ROOT = 'Data/data2/' | |||
| MAXLEN = 258 | |||
| @ck.command() | |||
| @ck.option( | |||
| '--function', | |||
| default='cc', | |||
| help='Ontology id (mf, bp, cc)') | |||
| @ck.option( | |||
| '--device', | |||
| default='gpu:0', | |||
| help='GPU or CPU device id') | |||
| @ck.option('--train', default=True, is_flag=True)  # flag defaults to True, so training runs unless the default is changed | |||
| def main(function, device, train): | |||
| global FUNCTION | |||
| FUNCTION = function | |||
| with tf.device('/' + device): | |||
| params = { | |||
| 'fc_output': 1024, | |||
| 'embedding_dims': 128, | |||
| 'embedding_dropout': 0.2, | |||
| 'nb_conv': 1, | |||
| 'nb_dense': 1, | |||
| 'filter_length': 128, | |||
| 'nb_filter': 32, | |||
| 'pool_length': 64, | |||
| 'stride': 32 | |||
| } | |||
| model(params, is_train=train) | |||
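| # Load the second dataset (feature matrix X, multi-label matrix Y), keep only the label columns | |||
| # belonging to the selected sub-ontology, and split the proteins into roughly 60/20/20 | |||
| # train/validation/test sets. | |||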
| def load_data2(): | |||
| all_data_x_fn = 'data2/all_data_X.csv' | |||
| all_data_x = pd.read_csv(all_data_x_fn, sep='\t', header=0, index_col=0) | |||
| all_proteins_train = [p.replace('"', '') for p in all_data_x.index] | |||
| all_data_x.index = all_proteins_train | |||
| all_data_y_fn = 'data2/all_data_Y.csv' | |||
| all_data_y = pd.read_csv(all_data_y_fn, sep='\t', header=0, index_col=0) | |||
| branch = pd.read_csv('data2/'+FUNCTION +'_branches.txt', sep='\t', header=0, index_col=0) | |||
| all_x = all_data_x.values | |||
| branches = [p for p in branch.index.tolist() if p in all_data_y.columns.tolist()] | |||
| branches = list(dict.fromkeys(branches)) | |||
| global mt_funcs | |||
| mt_funcs = branches | |||
| t = pd.DataFrame(all_data_y, columns=branches) | |||
| all_y = t.values | |||
| number_of_test = int(np.ceil(0.4 * len(all_x))) | |||
| np.random.seed(0) | |||
| index = np.random.rand(1,number_of_test) | |||
| index_test = [int(p) for p in np.ceil(index*(len(all_x)-1))[0]] | |||
| index_train = [p for p in range(len(all_x)) if p not in index_test] | |||
| train_data = all_x[index_train, : ] | |||
| test_x = all_x[index_test, : ] | |||
| train_labels = all_y[index_train, : ] | |||
| test_y = all_y[index_test, :] | |||
| number_of_vals = int(np.ceil(0.5 * len(test_x))) | |||
| np.random.seed(1) | |||
| index = np.random.rand(1, number_of_vals) | |||
| index_vals = [int(p) for p in np.ceil(index * (len(test_x) - 1))[0]] | |||
| index_test = [p for p in range(len(test_x)) if p not in index_vals] | |||
| val_data = test_x[index_vals, :] | |||
| test_data = test_x[index_test, :] | |||
| val_labels = test_y[index_vals, :] | |||
| test_labels = test_y[index_test, :] | |||
| print(train_labels.shape) | |||
| return train_data, train_labels, test_data, test_labels, val_data, val_labels | |||
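| # Sequence feature extractor: n-gram embedding followed by 1D convolution, LeakyReLU and max pooling. | |||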
| def get_feature_model(params): | |||
| embedding_dims = params['embedding_dims'] | |||
| max_features = 8001 | |||
| model = Sequential() | |||
| model.add(Embedding( | |||
| max_features, | |||
| embedding_dims, | |||
| input_length=MAXLEN)) | |||
| model.add(SpatialDropout1D(0.4)) | |||
| for i in range(params['nb_conv']): | |||
| model.add(Conv1D( | |||
| padding="valid", | |||
| strides=1, | |||
| filters=params['nb_filter'], | |||
| kernel_size=params['filter_length'])) | |||
| model.add(LeakyReLU()) | |||
| model.add(MaxPooling1D(strides=params['stride'], pool_size=params['pool_length'])) | |||
| model.add(Flatten()) | |||
| return model | |||
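| # Generator: a stack of dense layers with LeakyReLU activations and a tanh output over the GO | |||
| # classes (labels are rescaled to [-1, 1] to match). | |||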
| def get_generator(params, n_classes): | |||
| inputs = Input(shape=(MAXLEN,), dtype='float32', name='input1') | |||
| net0 = Dense(200)(inputs) | |||
| #net0 = BatchNormalization()(net0) | |||
| net0 = LeakyReLU()(net0) | |||
| net0 = Dense(200)(net0) | |||
| #net0 = BatchNormalization()(net0) | |||
| net0 = LeakyReLU()(net0) | |||
| #net0 = Dense(200, activation='relu')(inputs) | |||
| net0 = Dense(200)(net0) | |||
| #net0 = BatchNormalization()(net0) | |||
| net = LeakyReLU()(net0) | |||
| output = Dense(n_classes, activation='tanh')(net) | |||
| model = Model(inputs=inputs, outputs=output) | |||
| return model | |||
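| # Discriminator (WGAN critic): scores a (label vector, sequence) pair; Gaussian noise is added to | |||
| # the label input, the sequence input goes through an embedding and a 1x1 convolution, and the two | |||
| # branches are concatenated into a dense stack ending in a single linear output. | |||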
| def get_discriminator(params, n_classes, dropout_rate=0.5): | |||
| inputs = Input(shape=(n_classes, )) | |||
| n_inputs = GaussianNoise(0.1)(inputs) | |||
| inputs2 = Input(shape=(MAXLEN,), dtype='int32', name='d_input2') | |||
| x2 = Embedding(8001, 128, input_length=MAXLEN)(inputs2) | |||
| x2 = Conv1D(filters=1, kernel_size=1, padding='valid', strides=1)(x2) | |||
| x2 = LeakyReLU()(x2) | |||
| x2 = Lambda(lambda x: K.squeeze(x, 2))(x2) | |||
| #for i in range(params['nb_conv']): | |||
| # x2 = Conv1D ( activation="relu", padding="valid", strides=1, filters=params['nb_filter'],kernel_size=params['filter_length'])(x2) | |||
| #x2 =MaxPooling1D(strides=params['stride'], pool_size=params['pool_length'])(x2) | |||
| #x2 = Flatten()(x2) | |||
| size = 40 | |||
| x = n_inputs | |||
| x = Dropout(dropout_rate)(x) | |||
| x = Dense(size)(x) | |||
| x = BatchNormalization()(x) | |||
| x = LeakyReLU()(x) | |||
| size = 40 | |||
| x2 = Dropout(dropout_rate)(x2) | |||
| x2 = Dense(size)(x2) | |||
| x2 = BatchNormalization()(x2) | |||
| x2 = LeakyReLU()(x2) | |||
| x = Concatenate(axis=1, name='merged2')([x, x2]) | |||
| layer_sizes = [30, 30, 30, 30, 30, 30] | |||
| for size in layer_sizes: | |||
| x = Dropout(dropout_rate)(x) | |||
| x = Dense(size)(x) | |||
| x = BatchNormalization()(x) | |||
| x = LeakyReLU()(x) | |||
| outputs = Dense(1)(x) | |||
| model = Model(inputs=[inputs, inputs2], outputs=outputs, name='Discriminator') | |||
| return model | |||
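| # TensorBoard callback that writes training metrics via the base class and routes validation, | |||
| # discriminator and generator metrics to separate log directories so they can be compared. | |||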
| class TrainValTensorBoard(TensorBoard): | |||
| def __init__(self, log_dir='./t_logs', **kwargs): | |||
| # Keep the default `TensorBoard` behaviour for the training logs | |||
| training_log_dir = log_dir | |||
| super(TrainValTensorBoard, self).__init__(training_log_dir, **kwargs) | |||
| # Log validation, discriminator and generator metrics to separate directories | |||
| self.val_log_dir = './v_logs' | |||
| self.dis_log_dir = './d_logs' | |||
| self.gen_log_dir = './g_logs' | |||
| def set_model(self, model): | |||
| # Setup writer for validation metrics | |||
| self.val_writer = tf.summary.FileWriter(self.val_log_dir) | |||
| self.dis_writer = tf.summary.FileWriter(self.dis_log_dir) | |||
| self.gen_writer = tf.summary.FileWriter(self.gen_log_dir) | |||
| super(TrainValTensorBoard, self).set_model(model) | |||
| def on_epoch_end(self, epoch, logs=None): | |||
| # Pop the validation logs and handle them separately with | |||
| # `self.val_writer`. Also rename the keys so that they can | |||
| # be plotted on the same figure with the training metrics | |||
| logs = logs or {} | |||
| val_logs = {k.replace('val_', 'v_'): v for k, v in logs.items() if k.startswith('val_')} | |||
| for name, value in val_logs.items(): | |||
| summary = tf.Summary() | |||
| summary_value = summary.value.add() | |||
| summary_value.simple_value = value.item() | |||
| summary_value.tag = name | |||
| self.val_writer.add_summary(summary, epoch) | |||
| self.val_writer.flush() | |||
| logs = logs or {} | |||
| dis_logs = {k.replace('discriminator_', 'd_'): v for k, v in logs.items() if k.startswith('discriminator_')} | |||
| for name, value in dis_logs.items(): | |||
| summary = tf.Summary() | |||
| summary_value = summary.value.add() | |||
| summary_value.simple_value = value.item() | |||
| summary_value.tag = name | |||
| self.dis_writer.add_summary(summary, epoch) | |||
| self.dis_writer.flush() | |||
| logs = logs or {} | |||
| gen_logs = {k.replace('generator_', 'g_'): v for k, v in logs.items() if k.startswith('generator_')} | |||
| for name, value in gen_logs.items(): | |||
| summary = tf.Summary() | |||
| summary_value = summary.value.add() | |||
| summary_value.simple_value = value.item() | |||
| summary_value.tag = name | |||
| self.gen_writer.add_summary(summary, epoch) | |||
| self.gen_writer.flush() | |||
| # Pass the remaining logs to `TensorBoard.on_epoch_end` | |||
| t_logs = {k: v for k, v in logs.items() if not k.startswith('val_')} | |||
| tr_logs = {k: v for k, v in t_logs.items() if not k.startswith('discriminator_')} | |||
| tra_logs = {k: v for k, v in tr_logs.items() if not k.startswith('generator_')} | |||
| super(TrainValTensorBoard, self).on_epoch_end(epoch, tra_logs) | |||
| def on_train_end(self, logs=None): | |||
| super(TrainValTensorBoard, self).on_train_end(logs) | |||
| self.val_writer.close() | |||
| self.gen_writer.close() | |||
| self.dis_writer.close() | |||
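| # Labels are rescaled from {0, 1} to [-1, 1] to match the generator's tanh output, and mapped back before evaluation. | |||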
| def rescale_labels(labels): | |||
| rescaled_labels = 2 * (labels - 0.5) | |||
| return rescaled_labels | |||
| def rescale_back_labels(labels): | |||
| rescaled_back_labels = (labels / 2) + 0.5 | |||
| return rescaled_back_labels | |||
| def get_model(params, nb_classes, batch_size, GRADIENT_PENALTY_WEIGHT=10): | |||
| generator = get_generator(params, nb_classes) | |||
| discriminator = get_discriminator(params, nb_classes) | |||
| generator_model, discriminator_model = \ | |||
| WGAN_wrapper(generator=generator, | |||
| discriminator=discriminator, | |||
| generator_input_shape=(MAXLEN,), | |||
| discriminator_input_shape=(nb_classes,), | |||
| discriminator_input_shape2=(MAXLEN,), | |||
| batch_size=batch_size, | |||
| gradient_penalty_weight=GRADIENT_PENALTY_WEIGHT) | |||
| logging.info('Compilation finished') | |||
| return generator_model, discriminator_model | |||
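| # WGAN training loop: after an optional warm-up phase in which only the generator is trained, the | |||
| # discriminator (critic) is updated TRAINING_RATIO times per generator update; the models with the | |||
| # best validation log-loss are saved to the given paths. | |||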
| def train_wgan(generator_model, discriminator_model, batch_size, epochs, | |||
| x_train, y_train, x_val, y_val, generator_model_path, discriminator_model_path, | |||
| TRAINING_RATIO=10, N_WARM_UP=0): | |||
| BATCH_SIZE = batch_size | |||
| N_EPOCH = epochs | |||
| positive_y = np.ones((batch_size, 1), dtype=np.float32) | |||
| zero_y = positive_y * 0 | |||
| negative_y = -positive_y | |||
| positive_full_y = np.ones((BATCH_SIZE * TRAINING_RATIO, 1), dtype=np.float32) | |||
| dummy_y = np.zeros((BATCH_SIZE, 1), dtype=np.float32) | |||
| positive_full_enable_train = np.ones((len(x_train), 1), dtype=np.float32) | |||
| positive_full_enable_val = np.ones((len(x_val), 1), dtype=np.float32) | |||
| #positive_enable_train = np.ones((1, batch_size),dtype = np.float32 ) | |||
| #positive_full_train_enable = np.ones((1,BATCH_SIZE * TRAINING_RATIO ), dtype=np.float32 ) | |||
| best_validation_loss = None | |||
| callback = TrainValTensorBoard(write_graph=False) #TensorBoard(log_path) | |||
| callback.set_model(generator_model) | |||
| for epoch in range(N_EPOCH): | |||
| # np.random.shuffle(X_train) | |||
| print("Epoch: ", epoch) | |||
| print("Number of batches: ", int(y_train.shape[0] // BATCH_SIZE)) | |||
| discriminator_loss = [] | |||
| generator_loss = [] | |||
| minibatches_size = BATCH_SIZE * TRAINING_RATIO | |||
| shuffled_indexes = np.random.permutation(x_train.shape[0]) | |||
| shuffled_indexes_2 = np.random.permutation(x_train.shape[0]) | |||
| for i in range(int(y_train.shape[0] // (BATCH_SIZE * TRAINING_RATIO))): | |||
| batch_indexes = shuffled_indexes[i * minibatches_size:(i + 1) * minibatches_size] | |||
| batch_indexes_2 = shuffled_indexes_2[i * minibatches_size:(i + 1) * minibatches_size] | |||
| x = x_train[batch_indexes] | |||
| y = y_train[batch_indexes] | |||
| y_2 = y_train[batch_indexes_2] | |||
| x_2 = x_train[batch_indexes_2] | |||
| if epoch < N_WARM_UP: | |||
| for j in range(TRAINING_RATIO): | |||
| x_batch = x[j * BATCH_SIZE:(j + 1) * BATCH_SIZE] | |||
| y_batch = y[j * BATCH_SIZE:(j + 1) * BATCH_SIZE] | |||
| logg = generator_model.train_on_batch([x_batch, positive_y], [y_batch, zero_y]) | |||
| generator_loss.append(logg) | |||
| discriminator_loss.append(0) | |||
| else: | |||
| for j in range(TRAINING_RATIO): | |||
| x_batch = x[j * BATCH_SIZE:(j + 1) * BATCH_SIZE] | |||
| y_batch_2 = y_2[j * BATCH_SIZE:(j + 1) * BATCH_SIZE] | |||
| x_batch_2 = x_2[j * BATCH_SIZE:(j + 1) * BATCH_SIZE] | |||
| # noise = np.random.rand(BATCH_SIZE, 100).astype(np.float32) | |||
| noise = x_batch | |||
| discriminator_loss.append(discriminator_model.train_on_batch( | |||
| [y_batch_2, noise, x_batch_2], | |||
| [positive_y, negative_y, dummy_y])) | |||
| logg = generator_model.train_on_batch([x, positive_full_y], [y, positive_full_y]) | |||
| generator_loss.append(logg) | |||
| predicted_y_train, _ = generator_model.predict([x_train, positive_full_enable_train], batch_size=BATCH_SIZE) | |||
| predicted_y_val, _ = generator_model.predict([x_val, positive_full_enable_val], batch_size=BATCH_SIZE) | |||
| train_loss = log_loss(rescale_back_labels(y_train), rescale_back_labels(predicted_y_train)) | |||
| val_loss = log_loss(y_val, rescale_back_labels(predicted_y_val)) | |||
| callback.on_epoch_end(epoch, logs={'discriminator_loss': (np.sum(np.asarray(discriminator_loss)) if discriminator_loss else -1), | |||
| 'generator_loss': (np.sum(np.asarray(generator_loss)) if generator_loss else -1) ,'train_loss':train_loss ,'val_loss': val_loss}) | |||
| print("train loss: {:.4f}, validation loss: {:.4f}, validation auc: {:.4f}, generator loss: {:.4f}, discriminator loss: {:.4f}".format( | |||
| train_loss, val_loss, compute_roc(rescale_back_labels(predicted_y_val), y_val), | |||
| (np.sum(np.asarray(generator_loss)) if generator_loss else -1) / x_train.shape[0], | |||
| (np.sum(np.asarray(discriminator_loss)) if discriminator_loss else -1) / x_train.shape[0])) | |||
| if best_validation_loss is None or best_validation_loss > val_loss: | |||
| print('\nEpoch %05d: validation loss improved to %0.5f,' | |||
| ' saving model to %s and %s' | |||
| % (epoch + 1, val_loss, generator_model_path, discriminator_model_path)) | |||
| best_validation_loss = val_loss | |||
| generator_model.save(generator_model_path, overwrite=True) | |||
| discriminator_model.save(discriminator_model_path, overwrite=True) | |||
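| # End-to-end pipeline: load and rescale the data, build the WGAN, optionally train it, then reload | |||
| # the best generator and report protein-centric and function-centric metrics on the test set. | |||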
| def model(params, batch_size=128, nb_epoch=50, is_train=True): | |||
| start_time = time.time() | |||
| logging.info("Loading Data") | |||
| train_data, train_labels, test_data, test_labels, val_data, val_labels = load_data2() | |||
| train_labels = rescale_labels(train_labels) | |||
| nb_classes = train_labels.shape[1] | |||
| logging.info("Data loaded in %d sec" % (time.time() - start_time)) | |||
| logging.info("Training data size: %d" % len(train_data)) | |||
| logging.info("Validation data size: %d" % len(val_data)) | |||
| logging.info("Test data size: %d" % len(test_data)) | |||
| generator_model_path = DATA_ROOT + 'models/new_model_seq_mt_10000_1_loaded_pretrained_new_' + FUNCTION + '.h5' #_loaded_pretrained_new | |||
| discriminator_model_path = DATA_ROOT + 'models/new_model_disc_seq_mt_' + FUNCTION + '.h5' | |||
| logging.info('Starting training the model') | |||
| train_generator = DataGenerator(batch_size, nb_classes) | |||
| train_generator.fit(train_data, train_labels) | |||
| valid_generator = DataGenerator(batch_size, nb_classes) | |||
| valid_generator.fit(val_data, val_labels) | |||
| test_generator = DataGenerator(batch_size, nb_classes) | |||
| test_generator.fit(test_data, test_labels) | |||
| if is_train: | |||
| generator_model, discriminator_model = get_model(params, nb_classes, batch_size) | |||
| generator_model.load_weights(DATA_ROOT + 'models/new_model_seq_mt_1_0_' + FUNCTION + '.h5') | |||
| train_wgan(generator_model, discriminator_model, batch_size=batch_size, epochs=nb_epoch, | |||
| x_train=train_data, y_train=train_labels, x_val=val_data, y_val=val_labels, | |||
| generator_model_path=generator_model_path, | |||
| discriminator_model_path=discriminator_model_path) | |||
| logging.info('Loading best model') | |||
| model = load_model(generator_model_path, | |||
| custom_objects={'generator_recunstruction_loss_new': generator_recunstruction_loss_new, | |||
| 'wasserstein_loss': wasserstein_loss}) | |||
| logging.info('Predicting') | |||
| preds = model.predict_generator(test_generator, steps=len(test_data) / batch_size)[0] | |||
| preds = rescale_back_labels(preds) | |||
| logging.info('Computing performance') | |||
| f, p, r, t, preds_max = compute_performance(preds, test_labels) | |||
| roc_auc = compute_roc(preds, test_labels) | |||
| mcc = compute_mcc(preds_max, test_labels) | |||
| aupr, _ = compute_aupr(preds, test_labels) | |||
| m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max = micro_macro_function_centric_f1(preds.T, test_labels.T) | |||
| tpr, total_rule = tpr_score(preds_max) | |||
| logging.info('Protein centric macro Th, PR, RC, F1: \t %f %f %f %f' % (t, p, r, f)) | |||
| logging.info('ROC AUC: \t %f ' % (roc_auc, )) | |||
| logging.info('MCC: \t %f ' % (mcc, )) | |||
| logging.info('AUPR: \t %f ' % (aupr, )) | |||
| logging.info('TPR from total rule: \t %f \t %f' % (tpr, total_rule)) | |||
| logging.info('Function centric macro PR, RC, F1: \t %f %f %f' % (M_pr_max, M_rc_max, M_f1_max) ) | |||
| logging.info('Function centric micro PR, RC, F1: \t %f %f %f' % (m_pr_max, m_rc_max, m_f1_max) ) | |||
| function_centric_performance(preds.T, test_labels.T, rescale_back_labels(train_labels).T, t) | |||
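| # True-path-rule violation score: the hierarchy file is parsed into parent/child index relations | |||
| # (each line is taken to list a term's child indices), and predictions of a child term whose parent | |||
| # term is not predicted are counted as violations, normalised by the number of proteins. | |||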
| def tpr_score(preds): | |||
| hierarchy_f = open("data2/mtdnn_data_svmlight/" + FUNCTION + "_hierarchy.txt", "r") | |||
| hiers = hierarchy_f.readlines() | |||
| hierarchy_f.close() | |||
| hiers.pop(0)  # skip the first line of the hierarchy file | |||
| tree = [] | |||
| for i in range(len(hiers)): | |||
| sample = [] | |||
| str_sample = hiers[i].split() | |||
| for j in range(1, len(str_sample)): | |||
| sample.append(int(str_sample[j])) | |||
| tree.append(sample) | |||
| my_dict = {} | |||
| for i in range(preds.shape[0]): | |||
| my_dict[str(i)] = [] | |||
| total_rule = 0 | |||
| for i in range(len(tree)): | |||
| for j in range(len(tree[i])): | |||
| my_dict[str(tree[i][j]-1)].append(i) | |||
| total_rule += 1 | |||
| tpr = 0 | |||
| for i in range(preds.shape[0]): | |||
| for j in range(preds.shape[1]): | |||
| if preds[i, j] == 1: | |||
| parents = my_dict[str(j)] | |||
| for k in range(len(parents)): | |||
| if preds[i, parents[k]] == 0: | |||
| tpr = tpr + 1 | |||
| return tpr/preds.shape[0], total_rule | |||
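| # Term-centric evaluation at the chosen threshold: per-term precision, recall and F1 are written to | |||
| # a results file, and co-prediction "hitmap" matrices of the test predictions and training labels | |||
| # are compared via MSE and SSIM. | |||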
| def function_centric_performance(preds, labels, labels_train, threshold): | |||
| results = [] | |||
| av_f = 0 | |||
| preds = np.round(preds, 2) | |||
| for i in range(preds.shape[0]): | |||
| predictions = (preds[i, :] > threshold).astype(np.int32) | |||
| tp = np.sum(predictions * labels[i, :]) | |||
| fp = np.sum(predictions) - tp | |||
| fn = np.sum(labels[i, :]) - tp | |||
| if tp > 0: | |||
| precision = tp / (1.0 * (tp + fp)) | |||
| recall = tp / (1.0 * (tp + fn)) | |||
| f = 2 * precision * recall / (precision + recall) | |||
| else: | |||
| if fp == 0 and fn == 0: | |||
| precision = 1 | |||
| recall = 1 | |||
| f = 1 | |||
| else: | |||
| precision = 0 | |||
| recall = 0 | |||
| f = 0 | |||
| f_max = f | |||
| av_f += f | |||
| p_max = precision | |||
| r_max = recall | |||
| num_prots_train = np.sum(labels_train[i, :]) | |||
| results.append([mt_funcs[i], num_prots_train, f_max, p_max, r_max]) | |||
| print(av_f/preds.shape[0]) | |||
| results = pd.DataFrame(results) | |||
| results.to_csv('MT_1_0_results' + FUNCTION + '.txt', sep='\t', index=False) | |||
| predictions = (preds > threshold).astype(np.int32) | |||
| hitmap_test = (np.dot(predictions, predictions.T) > 0).astype(np.int32) | |||
| hitmap_train = (np.dot(labels_train, labels_train.T) > 0).astype(np.int32) | |||
| logging.info('Heatmap scores:\t MSE:%f \t SSIM %f ' % (mean_squared_error(hitmap_test, hitmap_train), measure.compare_ssim(hitmap_test, hitmap_train))) | |||
| #hitmap_test = pd.DataFrame(hitmap_test) | |||
| #hitmap_test.to_csv('hitmap_1_1_' + FUNCTION + '.txt', sep='\t', index=False) | |||
| #hitmap_train = pd.DataFrame(hitmap_train) | |||
| #hitmap_train.to_csv('hitmap_train_' + FUNCTION + '.txt', sep='\t', index=False) | |||
| #co_occure = np.zeros(shape=[predictions.shape[0], predictions.shape[0]]) | |||
| #mutual_ex = np.zeros(shape=[predictions.shape[0], predictions.shape[0]]) | |||
| #check_co_occur_test = np.zeros(shape=[predictions.shape[0], predictions.shape[0]]) | |||
| #for i in range(predictions.shape[0]): | |||
| # for j in range(predictions.shape[0]): | |||
| # temp = np.logical_xor(labels_train[i,:], labels_train[j,:]) | |||
| # if np.sum(temp) ==0: | |||
| # co_occure[i, j] = 1 | |||
| # temp = np.dot(labels_train[i,:], labels_train[j,:]) | |||
| # if np.sum(temp) ==0: | |||
| # mutual_ex[i,j] = 1 | |||
| # check_co_occur_test[i, j] = (sum(np.logical_xor(labels_train[i,:], labels_train[j,:])) >0).astype(np.int32) | |||
| #logging.info('Mutual_exclucivity_distorsion:\t %f ' % (sum(sum(np.multiply(mutual_ex, hitmap_test))))) | |||
| #logging.info('Co_occurance_distorsion:\t %f ' % (sum(sum(np.multiply(co_occure, check_co_occur_test))))) | |||
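| # Micro- and macro-averaged term-centric precision, recall and F1 at a 0.5 threshold | |||
| # (micro pools TP/FP/FN over all terms; macro averages per-term scores). | |||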
| def micro_macro_function_centric_f1(preds, labels): | |||
| predictions = np.round(preds) | |||
| m_tp = 0 | |||
| m_fp = 0 | |||
| m_fn = 0 | |||
| M_pr = 0 | |||
| M_rc = 0 | |||
| for i in range(preds.shape[0]): | |||
| tp = np.sum(predictions[i, :] * labels[i, :]) | |||
| fp = np.sum(predictions[i, :]) - tp | |||
| fn = np.sum(labels[i, :]) - tp | |||
| m_tp += tp | |||
| m_fp += fp | |||
| m_fn += fn | |||
| if tp > 0: | |||
| pr = 1.0 * tp / (1.0 * (tp + fp)) | |||
| rc = 1.0 * tp / (1.0 * (tp + fn)) | |||
| else: | |||
| if fp == 0 and fn == 0: | |||
| pr = 1 | |||
| rc = 1 | |||
| else: | |||
| pr = 0 | |||
| rc = 0 | |||
| M_pr += pr | |||
| M_rc += rc | |||
| if m_tp > 0: | |||
| m_pr = 1.0 * m_tp / (1.0 * (m_tp + m_fp)) | |||
| m_rc = 1.0 * m_tp / (1.0 * (m_tp + m_fn)) | |||
| m_f1 = 2.0 * m_pr * m_rc / (m_pr + m_rc) | |||
| else: | |||
| if m_fp == 0 and m_fn == 0: | |||
| m_pr = 1 | |||
| m_rc = 1 | |||
| m_f1 = 1 | |||
| else: | |||
| m_pr = 0 | |||
| m_rc = 0 | |||
| m_f1 = 0 | |||
| M_pr /= preds.shape[0] | |||
| M_rc /= preds.shape[0] | |||
| if M_pr == 0 and M_rc == 0: | |||
| M_f1 = 0 | |||
| else: | |||
| M_f1 = 2.0 * M_pr * M_rc / (M_pr + M_rc) | |||
| return m_pr, m_rc, m_f1, M_pr, M_rc, M_f1 | |||
| def compute_roc(preds, labels): | |||
| # Compute ROC curve and ROC area for each class | |||
| fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten()) | |||
| roc_auc = auc(fpr, tpr) | |||
| return roc_auc | |||
| def compute_aupr(preds, labels): | |||
| # Compute the precision-recall curve and the area under it (micro-averaged over all labels) | |||
| pr, rc, threshold = precision_recall_curve(labels.flatten(), preds.flatten()) | |||
| pr_auc = auc(rc, pr) | |||
| #pr, rc, threshold =precision_recall_curve(labels.flatten(), preds.flatten(),average ='macro' ) | |||
| M_pr_auc = 0  # macro-averaged PR AUC is not computed here | |||
| return pr_auc, M_pr_auc | |||
| def compute_mcc(preds, labels): | |||
| # Compute the Matthews correlation coefficient over all flattened label assignments | |||
| mcc = matthews_corrcoef(labels.flatten(), preds.flatten()) | |||
| return mcc | |||
| def compute_performance(preds, labels): | |||
| preds = np.round(preds, 2) | |||
| f_max = 0 | |||
| p_max = 0 | |||
| r_max = 0 | |||
| t_max = 0 | |||
| predictions_max = None  # avoids an UnboundLocalError if no threshold yields f > 0 | |||
| for t in range(1, 100): | |||
| threshold = t / 100.0 | |||
| predictions = (preds > threshold).astype(np.int32) | |||
| total = 0 | |||
| f = 0.0 | |||
| p = 0.0 | |||
| r = 0.0 | |||
| p_total = 0 | |||
| for i in range(labels.shape[0]): | |||
| tp = np.sum(predictions[i, :] * labels[i, :]) | |||
| fp = np.sum(predictions[i, :]) - tp | |||
| fn = np.sum(labels[i, :]) - tp | |||
| if tp == 0 and fp == 0 and fn == 0: | |||
| continue | |||
| total += 1 | |||
| if tp != 0: | |||
| p_total += 1 | |||
| precision = tp / (1.0 * (tp + fp)) | |||
| recall = tp / (1.0 * (tp + fn)) | |||
| p += precision | |||
| r += recall | |||
| if p_total == 0: | |||
| continue | |||
| r /= total | |||
| p /= p_total | |||
| if p + r > 0: | |||
| f = 2 * p * r / (p + r) | |||
| if f_max < f: | |||
| f_max = f | |||
| p_max = p | |||
| r_max = r | |||
| t_max = threshold | |||
| predictions_max = predictions | |||
| return f_max, p_max, r_max, t_max, predictions_max | |||
| if __name__ == '__main__': | |||
| main() | |||
| @@ -1,3 +1,6 @@ | |||
| # These functions are used to manipulate the first dataset from DeepGO and are taken from https://github.com/bio-ontology-research-group/deepgo. | |||
| from collections import deque | |||
| from keras import backend as K | |||
| from keras.callbacks import ModelCheckpoint | |||
| @@ -5,6 +8,8 @@ import warnings | |||
| import pandas as pd | |||
| from xml.etree import ElementTree as ET | |||
| BIOLOGICAL_PROCESS = 'GO:0008150' | |||
| MOLECULAR_FUNCTION = 'GO:0003674' | |||
| CELLULAR_COMPONENT = 'GO:0005575' | |||