|
|
@@ -1,917 +0,0 @@ |
|
|
|
#!/usr/bin/env python |
|
|
|
|
|
|
|
""" |
|
|
|
|
|
|
|
""" |
|
|
|
from __future__ import division |
|
|
|
|
|
|
|
import logging |
|
|
|
import sys |
|
|
|
import time |
|
|
|
from collections import deque |
|
|
|
from multiprocessing import Pool |
|
|
|
|
|
|
|
import click as ck |
|
|
|
import numpy as np |
|
|
|
import pandas as pd |
|
|
|
import tensorflow as tf |
|
|
|
from keras import backend as K |
|
|
|
from keras.callbacks import EarlyStopping, ModelCheckpoint |
|
|
|
from keras.layers import ( |
|
|
|
Dense, Input, SpatialDropout1D, Conv1D, MaxPooling1D, |
|
|
|
Flatten, Concatenate, Add, Maximum, Embedding, BatchNormalization, Activation, Dropout) |
|
|
|
from keras.losses import binary_crossentropy |
|
|
|
from keras.models import Sequential, Model, load_model |
|
|
|
from keras.preprocessing import sequence |
|
|
|
from scipy.spatial import distance |
|
|
|
from sklearn.metrics import log_loss |
|
|
|
from sklearn.metrics import roc_curve, auc, matthews_corrcoef |
|
|
|
from keras.layers import Lambda |
|
|
|
from sklearn.metrics import precision_recall_curve |
|
|
|
|
|
|
|
from utils import ( |
|
|
|
get_gene_ontology, |
|
|
|
get_go_set, |
|
|
|
get_anchestors, |
|
|
|
get_parents, |
|
|
|
DataGenerator, |
|
|
|
FUNC_DICT, |
|
|
|
get_height, |
|
|
|
get_ipro) |
|
|
|
from conditional_wgan_wrapper_post import WGAN_wrapper, wasserstein_loss, generator_recunstruction_loss_new |
|
|
|
|
|
|
|
# TensorFlow session shared with Keras; GPU memory growth is enabled on the
# session config before the session is created.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

# Recursion limit is raised far above the interpreter default.
# NOTE(review): presumably required by deep ontology traversal or model
# (de)serialisation - confirm before lowering.
sys.setrecursionlimit(100000)

# Root folder for pickled data sets and saved models.
DATA_ROOT = 'data/swiss/'
# Length of the feature vector fed to the networks (an earlier value was 1000).
MAXLEN = 258
REPLEN = 256
ind = 0
|
|
|
|
|
|
|
|
|
|
|
@ck.command()
@ck.option('--function', default='bp', help='Ontology id (mf, bp, cc)')
@ck.option('--device', default='gpu:0', help='GPU or CPU device id')
@ck.option('--org', default=None, help='Organism id for filtering test set')
@ck.option('--train', default=True, is_flag=True)
@ck.option('--param', default=0, help='Param index 0-7')
def main(function, device, org, train, param):
    """Command-line entry point.

    Publishes the ontology-related state used by the rest of the module as
    globals (FUNCTION, GO_ID, go, ORG, functions, func_set, all_functions,
    go_indexes, node_names) and then runs ``model`` on the chosen device.

    Args:
        function: Key into ``FUNC_DICT`` selecting the ontology.
        device: Device suffix; ``'/' + device`` is passed to ``tf.device``.
        org: Organism id stored in ``ORG`` (``None`` disables filtering).
        train: Forwarded to ``model`` as ``is_train``.
        param: Accepted for CLI compatibility; not used by the active code.
    """
    global FUNCTION, GO_ID, go, ORG
    global functions, func_set, all_functions, go_indexes, node_names

    FUNCTION = function
    GO_ID = FUNC_DICT[FUNCTION]
    go = get_gene_ontology('go.obo')
    ORG = org

    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    functions = func_df['functions'].values
    func_set = set(functions)
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)

    # Position of every GO id inside ``functions``.
    go_indexes = {go_id: position for position, go_id in enumerate(functions)}
    node_names = set()

    with tf.device('/' + device):
        params = dict([
            ('fc_output', 1024),
            ('learning_rate', 0.001),
            ('embedding_dims', 128),
            ('embedding_dropout', 0.2),
            ('nb_conv', 1),
            ('nb_dense', 1),
            ('filter_length', 128),
            ('nb_filter', 32),
            ('pool_length', 64),
            ('stride', 32),
        ])
        model(params, is_train=train)

    # A disabled hyper-parameter sweep used to follow here: it decoded
    # ``param`` into indexes over embedding dims [64, 128, 256, 512],
    # filter counts [16, 32, 64, 128], conv depths [1..4] and dense depths
    # [1..4]. A call to performanc_by_interpro() was disabled as well.
|
|
|
|
|
|
|
|
|
|
|
def load_data2(test_fraction=0.2):
    """Load the tab-separated feature/label tables and split them randomly.

    Reads ``data2/all_data_X.csv`` and ``data2/all_data_Y.csv`` and keeps only
    the label columns listed in ``data2/<FUNCTION>_branches.txt``. Rows are
    then split into a train part and a held-out part.

    The held-out rows are drawn without replacement from a random
    permutation. The previous ``ceil(rand * len)`` sampling could produce the
    out-of-range index ``len(all_x)``, practically never selected row 0 and
    could select the same row several times.

    Args:
        test_fraction: Share of rows placed in the held-out part (rounded up).

    Returns:
        ``(train_data, train_labels, test_data, test_labels, val_data,
        val_labels)``; the validation arrays are the very same objects as the
        test arrays.
    """
    all_data_x = pd.read_csv('data2/all_data_X.csv', sep='\t', header=0, index_col=0)
    # Protein ids may be wrapped in double quotes in the file.
    all_data_x.index = [p.replace('"', '') for p in all_data_x.index]
    all_data_y = pd.read_csv('data2/all_data_Y.csv', sep='\t', header=0, index_col=0)
    branch = pd.read_csv('data2/' + FUNCTION + '_branches.txt',
                         sep='\t', header=0, index_col=0)

    # NOTE(review): rows of X and Y are paired by position, not by index -
    # confirm both files list proteins in the same order.
    all_x = all_data_x.values
    label_columns = set(all_data_y.columns)
    branches = [p for p in branch.index.tolist() if p in label_columns]
    all_y = pd.DataFrame(all_data_y, columns=branches).values

    n_rows = len(all_x)
    number_of_test = int(np.ceil(test_fraction * n_rows))
    order = np.random.permutation(n_rows)
    index_test = order[:number_of_test]
    # Training rows keep their original file order, as before.
    index_train = np.sort(order[number_of_test:])

    train_data = all_x[index_train, :]
    test_data = all_x[index_test, :]
    train_labels = all_y[index_train, :]
    test_labels = all_y[index_test, :]
    val_data = test_data
    val_labels = test_labels

    print(train_labels.shape)
    print(test_labels.shape)
    return train_data, train_labels, test_data, test_labels, val_data, val_labels
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_data():
    """Load the pickled train/test frames for ``FUNCTION`` and vectorise them.

    The training pickle is split 80/20 (in stored order) into train and
    validation frames. When ``ORG`` is set, the test frame is restricted to
    rows whose ``orgs`` value equals it.

    Returns:
        ``(train, valid, test, train_df, valid_df, test_df)`` where the first
        three items are ``(data, labels)`` pairs.
    """
    def reshape(values):
        """Stack a sequence of equal-length vectors into a 2-D array."""
        flat = np.hstack(values)
        return flat.reshape(len(values), len(values[0]))

    def normalize_minmax(values):
        """Rescale to [0, 1]; a constant input is only shifted by its minimum."""
        low = np.min(values)
        high = np.max(values)
        if high - low != 0.0:
            return (values - low) / (high - low)
        return values - low

    def get_values(data_frame):
        """Return ``(padded ngram matrix, label matrix)`` for one frame."""
        print(data_frame['labels'].values.shape)
        labels = reshape(data_frame['labels'].values)
        ngrams = reshape(sequence.pad_sequences(
            data_frame['ngrams'].values, maxlen=MAXLEN))
        # The embeddings column is still read and reshaped, but its values
        # are not part of the returned data.
        rep = reshape(data_frame['embeddings'].values)
        return ngrams, labels

    frame = pd.read_pickle(DATA_ROOT + 'train' + '-' + FUNCTION + '.pkl')
    row_ids = frame.index.values
    cut = int(len(frame) * 0.8)
    train_df = frame.loc[row_ids[:cut]]
    valid_df = frame.loc[row_ids[cut:]]

    test_df = pd.read_pickle(DATA_ROOT + 'test' + '-' + FUNCTION + '.pkl')
    print(test_df['orgs'])
    if ORG is not None:
        logging.info('Unfiltered test size: %d' % len(test_df))
        test_df = test_df[test_df['orgs'] == ORG]
        logging.info('Filtered test size: %d' % len(test_df))

    # (A filter restricting the test set to organisms listed in
    # data/prokaryotes.pkl used to live here and is disabled.)

    train = get_values(train_df)
    valid = get_values(valid_df)
    test = get_values(test_df)
    return train, valid, test, train_df, valid_df, test_df
|
|
|
|
|
|
|
|
|
|
|
def get_feature_model(params):
    """Build the sequential embedding + convolution feature extractor.

    Reads ``embedding_dims``, ``nb_conv``, ``nb_filter``, ``filter_length``,
    ``stride`` and ``pool_length`` from ``params``.

    Returns:
        An uncompiled ``Sequential`` model whose output is flattened.
    """
    vocabulary_size = 8001
    feature_net = Sequential()
    feature_net.add(Embedding(
        vocabulary_size, params['embedding_dims'], input_length=MAXLEN))
    feature_net.add(SpatialDropout1D(0.4))
    for _ in range(params['nb_conv']):
        feature_net.add(Conv1D(
            filters=params['nb_filter'],
            kernel_size=params['filter_length'],
            strides=1,
            padding="valid",
            activation="relu"))
    feature_net.add(MaxPooling1D(
        pool_size=params['pool_length'], strides=params['stride']))
    feature_net.add(Flatten())
    return feature_net
|
|
|
|
|
|
|
|
|
|
|
def merge_outputs(outputs, name):
    """Concatenate tensors along axis 1; a single tensor is returned as is."""
    if len(outputs) != 1:
        return Concatenate(axis=1, name=name)(outputs)
    return outputs[0]
|
|
|
|
|
|
|
|
|
|
|
def merge_nets(nets, name):
    """Sum tensors element-wise; a single tensor is returned as is."""
    if len(nets) != 1:
        return Add(name=name)(nets)
    return nets[0]
|
|
|
|
|
|
|
def get_node_name(go_id, unique=False):
    """Turn a ``PREFIX:NUMBER`` id into a layer name.

    With ``unique`` set, the name is registered in the module-level
    ``node_names`` set; if the bare number is already taken, the first free
    ``NUMBER_<k>`` (k = 1, 2, ...) is used instead.
    """
    base = go_id.split(':')[1]
    if not unique:
        return base

    chosen = base
    if base in node_names:
        counter = 1
        while base + '_' + str(counter) in node_names:
            counter += 1
        chosen = base + '_' + str(counter)
    node_names.add(chosen)
    return chosen
|
|
|
|
|
|
|
|
|
|
|
def get_function_node(name, inputs):
    """Attach a one-unit sigmoid layer named ``<name>_out`` to ``inputs``.

    Returns:
        The same output tensor twice, as a ``(net, output)`` pair.
    """
    sigmoid_out = Dense(1, name=name + '_out', activation='sigmoid')(inputs)
    return sigmoid_out, sigmoid_out
|
|
|
|
|
|
|
|
|
|
|
def get_generator(params, n_classes):
    """Build the generator: a dense stack ending in ``n_classes`` sigmoids.

    Args:
        params: Unused by the current architecture.
        n_classes: Width of the sigmoid output layer.

    Returns:
        An uncompiled Keras ``Model`` taking a ``(MAXLEN,)`` float32 input.
    """
    inputs = Input(shape=(MAXLEN,), dtype='float32', name='input1')
    hidden = inputs
    for width in (150, 150, 70):
        hidden = Dense(width, activation='relu')(hidden)
    output = Dense(n_classes, activation='sigmoid')(hidden)
    return Model(inputs=inputs, outputs=output)
|
|
|
|
|
|
|
|
|
|
|
def get_discriminator(params, n_classes, dropout_rate=0.5):
    """Build the two-input critic.

    The first input is a label vector of width ``n_classes``. The second is
    an int32 vector of length ``MAXLEN`` that is embedded and reduced back to
    one value per position by a 1x1 convolution. Each branch goes through one
    dropout/dense/batch-norm/ReLU stage of width 40; the branches are then
    concatenated and pass three more such stages (80, 40, 30) before a single
    linear output unit.

    Args:
        params: Unused by the current architecture.
        n_classes: Width of the label input.
        dropout_rate: Rate used by every dropout layer.

    Returns:
        An uncompiled Keras ``Model`` named ``'Discriminator'``.
    """
    def dense_stage(tensor, width):
        """Dropout -> Dense(width) -> BatchNormalization -> ReLU."""
        tensor = Dropout(dropout_rate)(tensor)
        tensor = Dense(width)(tensor)
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    inputs = Input(shape=(n_classes, ))
    inputs2 = Input(shape=(MAXLEN,), dtype='int32', name='d_input2')

    encoded = Embedding(8001, 128, input_length=MAXLEN)(inputs2)
    encoded = Conv1D(filters=1, kernel_size=1, padding='valid',
                     activation='relu', strides=1)(encoded)
    # Drop the singleton channel axis left by the one-filter convolution.
    encoded = Lambda(lambda x: K.squeeze(x, 2))(encoded)

    label_branch = dense_stage(inputs, 40)
    sequence_branch = dense_stage(encoded, 40)

    merged = Concatenate(axis=1, name='merged2')([label_branch, sequence_branch])
    for width in [80, 40, 30]:
        merged = dense_stage(merged, width)

    outputs = Dense(1)(merged)
    return Model(inputs=[inputs, inputs2], outputs=outputs, name='Discriminator')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_model(params, nb_classes, batch_size, GRADIENT_PENALTY_WEIGHT=10):
    """Create generator and critic and wrap them with ``WGAN_wrapper``.

    Returns:
        The ``(generator_model, discriminator_model)`` pair produced by
        ``WGAN_wrapper``.
    """
    generator = get_generator(params, nb_classes)
    discriminator = get_discriminator(params, nb_classes)

    wrapped = WGAN_wrapper(
        generator=generator,
        discriminator=discriminator,
        generator_input_shape=(MAXLEN,),
        discriminator_input_shape=(nb_classes,),
        discriminator_input_shape2=(MAXLEN, ),
        batch_size=batch_size,
        gradient_penalty_weight=GRADIENT_PENALTY_WEIGHT)
    generator_model, discriminator_model = wrapped

    logging.info('Compilation finished')
    return generator_model, discriminator_model
|
|
|
|
|
|
|
|
|
|
|
def train_wgan(generator_model, discriminator_model, batch_size, epochs,
               x_train, y_train, x_val, y_val, generator_model_path, discriminator_model_path,
               TRAINING_RATIO=10, N_WARM_UP=0):
    """Alternate critic and generator updates and checkpoint the best epoch.

    Each epoch walks over the shuffled training set in chunks of
    ``batch_size * TRAINING_RATIO`` rows. During the first ``N_WARM_UP``
    epochs only the generator is trained, one mini-batch at a time.
    Afterwards the critic is trained on ``TRAINING_RATIO`` mini-batches of
    the chunk and the generator once on the whole chunk.

    After every epoch the generator's first output is scored with
    ``log_loss`` on the training and validation sets. Both models are saved
    whenever the validation loss improves.

    Args:
        generator_model: Model trained on ``[features, ones]`` inputs; its
            ``predict`` returns two outputs, the first being the class
            probabilities.
        discriminator_model: Model trained on ``[labels, features,
            features_2]`` inputs with three targets.
        batch_size: Rows per mini-batch.
        epochs: Number of passes over the training data.
        x_train, y_train, x_val, y_val: Feature and label arrays.
        generator_model_path, discriminator_model_path: Checkpoint files.
        TRAINING_RATIO: Critic mini-batches per generator update.
        N_WARM_UP: Number of leading generator-only epochs.
    """
    chunk_size = batch_size * TRAINING_RATIO

    positive_y = np.ones((batch_size, 1), dtype=np.float32)
    zero_y = positive_y * 0
    negative_y = -positive_y
    positive_full_y = np.ones((chunk_size, 1), dtype=np.float32)
    # Placeholder target for the critic's third output.
    # NOTE(review): presumably the gradient-penalty term - confirm in WGAN_wrapper.
    dummy_y = np.zeros((batch_size, 1), dtype=np.float32)

    # All-ones second input used when predicting over full data sets.
    positive_full_enable_train = np.ones((len(x_train), 1), dtype=np.float32)
    positive_full_enable_val = np.ones((len(x_val), 1), dtype=np.float32)

    best_validation_loss = None

    for epoch in range(epochs):
        print("Epoch: ", epoch)
        print("Number of batches: ", int(y_train.shape[0] // batch_size))
        discriminator_loss = []
        generator_loss = []

        # Two independent orderings: the first feeds the generator side, the
        # second supplies the real (label, feature) pairs shown to the critic.
        shuffled_indexes = np.random.permutation(x_train.shape[0])
        shuffled_indexes_2 = np.random.permutation(x_train.shape[0])

        for i in range(int(y_train.shape[0] // chunk_size)):
            chunk = slice(i * chunk_size, (i + 1) * chunk_size)
            batch_indexes = shuffled_indexes[chunk]
            batch_indexes_2 = shuffled_indexes_2[chunk]
            x = x_train[batch_indexes]
            y = y_train[batch_indexes]
            y_2 = y_train[batch_indexes_2]
            x_2 = x_train[batch_indexes_2]

            if epoch < N_WARM_UP:
                for j in range(TRAINING_RATIO):
                    part = slice(j * batch_size, (j + 1) * batch_size)
                    generator_loss.append(generator_model.train_on_batch(
                        [x[part], positive_y], [y[part], zero_y]))
            else:
                for j in range(TRAINING_RATIO):
                    part = slice(j * batch_size, (j + 1) * batch_size)
                    # The generator is conditioned on the features themselves
                    # rather than on random noise.
                    noise = x[part]
                    discriminator_loss.append(discriminator_model.train_on_batch(
                        [y_2[part], noise, x_2[part]],
                        [positive_y, negative_y, dummy_y]))
                generator_loss.append(generator_model.train_on_batch(
                    [x, positive_full_y], [y, positive_full_y]))

        predicted_y_train, _ = generator_model.predict(
            [x_train, positive_full_enable_train], batch_size=batch_size)
        predicted_y_val, _ = generator_model.predict(
            [x_val, positive_full_enable_val], batch_size=batch_size)

        train_loss = log_loss(y_train, predicted_y_train)
        val_loss = log_loss(y_val, predicted_y_val)

        # NOTE(review): the critic loss is summed over all recorded values
        # and divided by the number of training rows; -1 marks "no critic
        # updates this epoch" before that division.
        print("train loss: {:.4f}, validation loss: {:.4f}, discriminator loss: {:.4f}".format(
            train_loss, val_loss,
            (np.sum(np.asarray(discriminator_loss)) if discriminator_loss else -1) / x_train.shape[0]))

        if best_validation_loss is None or best_validation_loss > val_loss:
            # Report the previous best next to the new value; the old message
            # printed the new loss after "improved from".
            previous_best = float('inf') if best_validation_loss is None else best_validation_loss
            print('\nEpoch %05d: improved from %0.5f to %0.5f,'
                  ' saving model to %s and %s'
                  % (epoch + 1, previous_best, val_loss,
                     generator_model_path, discriminator_model_path))

            best_validation_loss = val_loss
            generator_model.save(generator_model_path, overwrite=True)
            discriminator_model.save(discriminator_model_path, overwrite=True)
|
|
|
|
|
|
|
|
|
|
|
def model(params, batch_size=20, nb_epoch=40, is_train=True):
    """Load data, optionally train the WGAN, then evaluate the saved generator.

    Data comes from ``load_data2``. When ``is_train`` is true a fresh
    generator/critic pair is trained with ``train_wgan``. In every case the
    generator checkpoint is reloaded from disk, used to predict the test set,
    and scored with the protein-centric, function-centric, ROC, MCC and AUPR
    helpers of this module. Per-function results are written by
    ``function_centric_performance``.

    Args:
        params: Architecture parameters forwarded to ``get_model``.
        batch_size: Mini-batch size for training and prediction.
        nb_epoch: Number of training epochs.
        is_train: Skip training and only evaluate when false.
    """
    start_time = time.time()
    logging.info("Loading Data")

    # (An alternative path using load_data() and the pickled data frames is
    # disabled; load_data2() is the active loader.)
    train_data, train_labels, test_data, test_labels, val_data, val_labels = load_data2()
    nb_classes = train_labels.shape[1]

    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data))
    logging.info("Validation data size: %d" % len(val_data))
    logging.info("Test data size: %d" % len(test_data))

    generator_model_path = DATA_ROOT + 'models/new_model_seq_' + FUNCTION + '.h5'
    discriminator_model_path = DATA_ROOT + 'models/new_model_disc_seq_' + FUNCTION + '.h5'

    logging.info('Starting training the model')

    # Only test_generator is consumed below; the train/valid generators are
    # built and fitted but not used afterwards in this function.
    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)

    if is_train:
        generator_model, discriminator_model = get_model(params, nb_classes, batch_size)
        train_wgan(generator_model, discriminator_model,
                   batch_size=batch_size, epochs=nb_epoch,
                   x_train=train_data, y_train=train_labels,
                   x_val=val_data, y_val=val_labels,
                   generator_model_path=generator_model_path,
                   discriminator_model_path=discriminator_model_path)

    logging.info('Loading best model')
    best_generator = load_model(
        generator_model_path,
        custom_objects={'generator_recunstruction_loss_new': generator_recunstruction_loss_new,
                        'wasserstein_loss': wasserstein_loss})

    logging.info('Predicting')
    # ``steps`` must be a whole number of batches; plain ``/`` yields a float
    # under true division. Round up so the last partial batch is included.
    n_steps = int(np.ceil(len(test_data) / float(batch_size)))
    preds = best_generator.predict_generator(test_generator, steps=n_steps)[0]
    # Keep one prediction row per test row in case the generator produced
    # extra rows in the final step.
    # NOTE(review): assumes DataGenerator yields rows in test order - confirm.
    preds = preds[:len(test_data)]

    # (A disabled post-processing step used to raise each parent score to the
    # maximum of its children's scores.)

    logging.info('Computing performance')
    f, p, r, t, preds_max = compute_performance(preds, test_labels)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    aupr, _ = compute_aupr(preds, test_labels)
    m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max = \
        micro_macro_function_centric_f1(preds.T, test_labels.T)

    logging.info('Protein centric macro Th, PR, RC, F1: \t %f %f %f %f' % (t, p, r, f))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    logging.info('AUPR: \t %f ' % (aupr, ))
    logging.info('Function centric macro PR, RC, F1: \t %f %f %f' % (M_pr_max, M_rc_max, M_f1_max))
    logging.info('Function centric micro PR, RC, F1: \t %f %f %f' % (m_pr_max, m_rc_max, m_f1_max))

    function_centric_performance(functions, preds.T, test_labels.T, train_labels.T)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_prot_ipro():
    """Read ``swissprot_ipro.tab`` into a proteins/ipros data frame.

    Only lines with exactly three tab-separated fields are used: the second
    field is the protein and the third a ';'-separated list of InterPro ids.

    Returns:
        ``DataFrame`` with columns ``proteins`` and ``ipros`` (list per row).
    """
    records = []
    with open(DATA_ROOT + 'swissprot_ipro.tab') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) == 3:
                records.append((fields[1], fields[2].split(';')))
    return pd.DataFrame({
        'proteins': [protein for protein, _ in records],
        'ipros': [ids for _, ids in records],
    })
|
|
|
|
|
|
|
|
|
|
|
def performanc_by_interpro():
    """Log precision/recall/F for the proteins of every root InterPro entry.

    Stored test predictions are joined with the protein-to-InterPro table.
    For each InterPro id without parents, precision is averaged over proteins
    with at least one true positive and recall over all scored proteins. A
    line is logged only when the resulting F value is defined
    (``pr + rc > 0``).

    The prediction frame is scanned once and its rows are grouped by InterPro
    id, instead of re-iterating the whole frame for every id.
    """
    pred_df = pd.read_pickle(DATA_ROOT + 'test-' + FUNCTION + '-preds.pkl')
    ipro_df = load_prot_ipro()
    df = pred_df.merge(ipro_df, on='proteins', how='left')
    ipro = get_ipro()

    # InterPro id -> [(labels, predictions, gos), ...] in frame order.
    rows_by_ipro = {}
    for _, row in df.iterrows():
        row_ipros = row['ipros']
        # Proteins without a match in the InterPro table carry NaN here.
        if not isinstance(row_ipros, list):
            continue
        record = (row['labels'], row['predictions'], row['gos'])
        # set(): a protein counts once even if an id is listed twice.
        for ipro_id in set(row_ipros):
            rows_by_ipro.setdefault(ipro_id, []).append(record)

    for ipro_id in ipro:
        if len(ipro[ipro_id]['parents']) > 0:
            continue
        records = rows_by_ipro.get(ipro_id, [])

        pr = 0
        rc = 0
        total = 0
        p_total = 0
        for labels, predictions, gos in records:
            tp = np.sum(labels * predictions)
            fp = np.sum(predictions) - tp
            fn = np.sum(labels) - tp

            # Annotated terms (with ancestors) that lie outside the predicted
            # function set count as additional false negatives.
            all_gos = set()
            for go_id in gos:
                if go_id in all_functions:
                    all_gos |= get_anchestors(go, go_id)
            all_gos.discard(GO_ID)
            all_gos -= func_set
            fn += len(all_gos)

            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                pr += tp / (1.0 * (tp + fp))
                rc += tp / (1.0 * (tp + fn))

        if total > 0 and p_total > 0:
            rc /= total
            pr /= p_total
            if pr + rc > 0:
                f = 2 * pr * rc / (pr + rc)
                logging.info('%s\t%d\t%f\t%f\t%f' % (
                    ipro_id, len(records), f, pr, rc))
|
|
|
|
|
|
|
|
|
|
|
def function_centric_performance(functions, preds, labels, labels_train):
    """Write the best F1 per function (row) to a tab-separated file.

    For every row of ``preds`` the thresholds 0.01 .. 0.99 are tried and the
    one with the highest F1 is kept. A row with no true positives scores 1
    when it also has no false positives and no false negatives, else 0. Each
    output row holds: function id, number of positive training examples,
    ontology height, F1, precision, recall.

    The table goes to ``Con_GodGanSeq_results_<FUNCTION>.txt``.
    """
    rounded = np.round(preds, 2)
    table = []
    for row in range(rounded.shape[0]):
        scores = rounded[row, :]
        truth = labels[row, :]
        best_f, best_p, best_r = 0, 0, 0
        for step in range(1, 100):
            called = (scores > step / 100.0).astype(np.int32)
            tp = np.sum(called * truth)
            fp = np.sum(called) - tp
            fn = np.sum(truth) - tp

            if tp > 0:
                precision = tp / (1.0 * (tp + fp))
                recall = tp / (1.0 * (tp + fn))
                f = 2 * precision * recall / (precision + recall)
            elif fp == 0 and fn == 0:
                precision, recall, f = 1, 1, 1
            else:
                precision, recall, f = 0, 0, 0

            if best_f < f:
                best_f, best_p, best_r = f, precision, recall

        num_prots_train = np.sum(labels_train[row, :])
        height = get_height(go, functions[row])
        table.append([functions[row], num_prots_train, height,
                      best_f, best_p, best_r])

    pd.DataFrame(table).to_csv(
        'Con_GodGanSeq_results_' + FUNCTION + '.txt', sep='\t', index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def function_centric_performance_backup(functions, preds, labels, labels_train):
    """Older per-function report that also estimates a ROC AUC.

    For each function the thresholds 0.01 .. 0.99 are scanned. Only
    thresholds with at least one true positive contribute a ROC point and
    take part in the best-F1 search. The AUC is 0 unless more than one such
    threshold exists. Each output row holds: function id, F1, precision,
    recall, positives in ``labels``, positives in ``labels_train``, ontology
    height, ROC AUC.

    The table goes to ``Con_GodGanSeq_results_<FUNCTION>.txt``.
    """
    rounded = np.round(preds, 2)
    table = []
    for idx in range(len(functions)):
        truth = labels[idx, :]
        best_f, best_p, best_r = 0, 0, 0
        fpr_points = []
        tpr_points = []
        usable = 0
        for step in range(1, 100):
            called = (rounded[idx, :] > step / 100.0).astype(np.int32)
            tp = np.sum(called * truth)
            fp = np.sum(called) - tp
            fn = np.sum(truth) - tp
            if not tp > 0:
                continue

            sensitivity = tp / (1.0 * np.sum(truth))
            # ``^ 1`` flips 0/1 integer arrays to count true negatives.
            specificity = np.sum((called ^ 1) * (truth ^ 1))
            specificity = specificity / (1.0 * np.sum(truth ^ 1))
            fpr_points.append(1 - specificity)
            tpr_points.append(sensitivity)

            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            f = 2 * precision * recall / (precision + recall)
            usable += 1
            if best_f < f:
                best_f, best_p, best_r = f, precision, recall

        num_prots = np.sum(truth)
        num_prots_train = np.sum(labels_train[idx, :])
        roc_auc = auc(fpr_points, tpr_points) if usable > 1 else 0
        height = get_height(go, functions[idx])
        table.append([functions[idx], best_f, best_p, best_r,
                      num_prots, num_prots_train, height, roc_auc])

    pd.DataFrame(table).to_csv(
        'Con_GodGanSeq_results_' + FUNCTION + '.txt', sep='\t', index=False)
    # (A disabled summary used to print macro-averaged F/PR/RC/AUC here.)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def micro_macro_function_centric_f1_backup(preds, labels):
    """Older micro/macro F1 search over thresholds 0.01 .. 0.99.

    Rows with no predictions and no labels are ignored. Only rows with at
    least one true positive contribute counts. Macro precision is averaged
    over those rows, macro recall over all scored rows.

    All maxima start at 0, so inputs for which no threshold yields a positive
    F1 return zeros instead of raising ``UnboundLocalError``.

    Returns:
        ``(m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max)`` -
        micro values first, then macro values.
    """
    preds = np.round(preds, 2)
    m_pr_max = m_rc_max = m_f1_max = 0
    M_pr_max = M_rc_max = M_f1_max = 0

    for t in range(1, 100):
        threshold = t / 100.0
        predictions = (preds > threshold).astype(np.int32)
        m_tp = m_fp = m_fn = 0
        M_pr = M_rc = 0
        total = 0
        p_total = 0
        for i in range(len(preds)):
            tp = np.sum(predictions[i, :] * labels[i, :])
            fp = np.sum(predictions[i, :]) - tp
            fn = np.sum(labels[i, :]) - tp
            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp > 0:
                m_tp += tp
                m_fp += fp
                m_fn += fn
                M_pr += tp / (1.0 * (tp + fp))
                M_rc += tp / (1.0 * (tp + fn))
                p_total += 1

        # No row had a true positive at this threshold: nothing to score.
        if p_total == 0:
            continue

        # p_total > 0 implies total > 0 and m_tp > 0, so the divisions below
        # are safe.
        m_tp /= 1.0 * total
        m_fn /= 1.0 * total
        m_fp /= 1.0 * total
        m_pr = m_tp / (1.0 * (m_tp + m_fp))
        m_rc = m_tp / (1.0 * (m_tp + m_fn))
        M_pr /= 1.0 * p_total
        M_rc /= 1.0 * total
        m_f1 = 2 * m_pr * m_rc / (m_pr + m_rc)
        M_f1 = 2 * M_pr * M_rc / (M_pr + M_rc)

        if m_f1 > m_f1_max:
            m_f1_max, m_pr_max, m_rc_max = m_f1, m_pr, m_rc
        if M_f1 > M_f1_max:
            M_f1_max, M_pr_max, M_rc_max = M_f1, M_pr, M_rc

    return m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def micro_macro_function_centric_f1(preds, labels):
    """Best micro- and macro-averaged F1 over thresholds 1/200 .. 199/200.

    ``preds`` and ``labels`` are 2-D arrays of the same shape; each row is
    scored as one unit. A row without true positives scores precision =
    recall = 1 when it also has no false positives and no false negatives,
    and 0 otherwise. The micro average applies the same rule to the counts
    summed over all rows.

    All maxima start at 0, so inputs for which no threshold yields a positive
    F1 return zeros instead of raising ``UnboundLocalError``.

    Returns:
        ``(m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max)`` -
        micro values first, then macro values.
    """
    preds = np.round(preds, 2)
    labels = np.asarray(labels)
    n_rows = preds.shape[0]
    m_pr_max = m_rc_max = m_f1_max = 0
    M_pr_max = M_rc_max = M_f1_max = 0

    positives_per_row = labels.sum(axis=1)  # threshold-independent

    for t in range(1, 200):
        threshold = t / 200.0
        predictions = (preds > threshold).astype(np.int32)

        tp = (predictions * labels).sum(axis=1).astype(np.float64)
        fp = predictions.sum(axis=1) - tp
        fn = positives_per_row - tp

        # Macro part: per-row precision/recall with the "empty row" rule.
        hit = tp > 0
        empty_and_right = ~hit & (fp == 0) & (fn == 0)
        row_pr = empty_and_right.astype(np.float64)
        row_rc = empty_and_right.astype(np.float64)
        row_pr[hit] = tp[hit] / (tp[hit] + fp[hit])
        row_rc[hit] = tp[hit] / (tp[hit] + fn[hit])
        # Plain Python division keeps the ZeroDivisionError for empty input.
        M_pr = float(row_pr.sum()) / n_rows
        M_rc = float(row_rc.sum()) / n_rows
        if M_pr == 0 and M_rc == 0:
            M_f1 = 0
        else:
            M_f1 = 2.0 * M_pr * M_rc / (M_pr + M_rc)

        # Micro part: pooled counts.
        m_tp = float(tp.sum())
        m_fp = float(fp.sum())
        m_fn = float(fn.sum())
        if m_tp > 0:
            m_pr = m_tp / (m_tp + m_fp)
            m_rc = m_tp / (m_tp + m_fn)
            m_f1 = 2.0 * m_pr * m_rc / (m_pr + m_rc)
        elif m_fp == 0 and m_fn == 0:
            m_pr, m_rc, m_f1 = 1, 1, 1
        else:
            m_pr, m_rc, m_f1 = 0, 0, 0

        if m_f1 > m_f1_max:
            m_f1_max, m_pr_max, m_rc_max = m_f1, m_pr, m_rc
        if M_f1 > M_f1_max:
            M_f1_max, M_pr_max, M_rc_max = M_f1, M_pr, M_rc

    return m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max
|
|
|
|
|
|
|
|
|
|
|
def compute_roc(preds, labels):
    """Return the ROC AUC over all (row, column) entries pooled together."""
    false_pos_rate, true_pos_rate, _ = roc_curve(labels.flatten(), preds.flatten())
    return auc(false_pos_rate, true_pos_rate)
|
|
|
|
|
|
|
|
|
|
|
def compute_aupr(preds, labels):
    """Return ``(pooled precision-recall AUC, 0)``.

    The second item is a placeholder for a macro-averaged value that is not
    computed.
    """
    flat_labels = labels.flatten()
    flat_scores = preds.flatten()
    pr, rc, threshold = precision_recall_curve(flat_labels, flat_scores)
    macro_placeholder = 0
    return auc(rc, pr), macro_placeholder
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def compute_mcc(preds, labels):
    """Return the Matthews correlation coefficient over all pooled entries."""
    flat_truth = labels.flatten()
    flat_calls = preds.flatten()
    return matthews_corrcoef(flat_truth, flat_calls)
|
|
|
|
|
|
|
|
|
|
|
def compute_performance(preds, labels):  # , gos):
    """Protein-centric F-max over thresholds 0.01 .. 0.99.

    For each threshold, rows with neither predictions nor labels are skipped.
    Precision is averaged over rows with at least one true positive, recall
    over all remaining rows. Thresholds where no row has a true positive are
    ignored.

    ``predictions_max`` starts as an all-zero matrix, so the function returns
    instead of raising ``UnboundLocalError`` when no threshold reaches a
    positive F.

    Args:
        preds: 2-D score array, one row per protein.
        labels: 2-D 0/1 array; only the first ``labels.shape[0]`` rows of
            ``preds`` are scored.

    Returns:
        ``(f_max, p_max, r_max, t_max, predictions_max)`` where
        ``predictions_max`` is the binarised ``preds`` at the best threshold.
    """
    preds = np.round(preds, 2)
    labels = np.asarray(labels)
    n_rows = labels.shape[0]

    f_max = 0
    p_max = 0
    r_max = 0
    t_max = 0
    predictions_max = np.zeros(preds.shape, dtype=np.int32)

    positives_per_row = labels.sum(axis=1)  # threshold-independent

    for t in range(1, 100):
        threshold = t / 100.0
        predictions = (preds > threshold).astype(np.int32)

        scored_part = predictions[:n_rows]
        tp = (scored_part * labels).sum(axis=1)
        fp = scored_part.sum(axis=1) - tp
        fn = positives_per_row - tp

        hit = tp != 0
        p_total = int(np.count_nonzero(hit))
        if p_total == 0:
            continue
        # Rows that have anything to score at all.
        total = int(np.count_nonzero(hit | (fp != 0) | (fn != 0)))

        tp_hit = tp[hit].astype(np.float64)
        p = float(np.sum(tp_hit / (tp_hit + fp[hit]))) / p_total
        r = float(np.sum(tp_hit / (tp_hit + fn[hit]))) / total

        f = 0.0
        if p + r > 0:
            f = 2 * p * r / (p + r)
        if f_max < f:
            f_max = f
            p_max = p
            r_max = r
            t_max = threshold
            predictions_max = predictions

    return f_max, p_max, r_max, t_max, predictions_max
|
|
|
|
|
|
|
|
|
|
|
def get_gos(pred):
    """Return the GO list of the training entry closest to ``pred``.

    Closeness is cosine distance against the label vectors held in the
    module-level ``labels_gos`` sequence. Only entries at a distance strictly
    below 1.0 qualify; ``None`` is returned when none does.
    """
    closest_gos = None
    closest_dist = 1.0
    for idx in range(len(labels_gos)):
        candidate_labels, candidate_gos = labels_gos[idx]
        current = distance.cosine(pred, candidate_labels)
        if current < closest_dist:
            closest_dist = current
            closest_gos = candidate_gos
    return closest_gos
|
|
|
|
|
|
|
|
|
|
|
def compute_similarity_performance(train_df, test_df, preds):
    """Score a nearest-neighbour baseline that copies GO terms from training.

    Every prediction vector is matched (via ``get_gos`` in a process pool) to
    the closest training label vector and inherits that entry's GO terms.
    Predicted and true term sets are expanded with their ancestors and
    compared per protein.

    Fixes over the previous version:
      * ``labels_gos`` is materialised as a list. A bare ``zip`` object
        supports neither ``len()`` nor indexing on Python 3, both of which
        ``get_gos`` needs.
      * The worker pool is closed and joined, and no longer shares a name
        with the precision accumulator.
      * A prediction with no neighbour (``get_gos`` returned ``None``) is
        treated as an empty term set.
      * Zeros are returned when no protein could be scored instead of
        dividing by zero.

    Returns:
        ``(f, p, r)`` averaged over all scored proteins.
    """
    global labels_gos

    logging.info("Computing similarity performance")
    logging.info("Training data size %d" % len(train_df))
    train_labels = train_df['labels'].values
    train_gos = train_df['gos'].values
    # Published as a global so that the pool workers can read it.
    # NOTE(review): relies on workers inheriting module globals (fork start
    # method) - confirm on the target platform.
    labels_gos = list(zip(train_labels, train_gos))

    pool = Pool(64)
    try:
        pred_gos = pool.map(get_gos, preds)
    finally:
        pool.close()
        pool.join()

    total = 0
    p = 0.0
    r = 0.0
    f = 0.0
    test_gos = test_df['gos'].values
    for gos, tgos in zip(pred_gos, test_gos):
        predicted_terms = set()
        true_terms = set()
        for go_id in (gos if gos is not None else ()):
            if go_id in all_functions:
                predicted_terms |= get_anchestors(go, go_id)
        for go_id in tgos:
            if go_id in all_functions:
                true_terms |= get_anchestors(go, go_id)

        tp = len(predicted_terms & true_terms)
        fp = len(predicted_terms - true_terms)
        fn = len(true_terms - predicted_terms)
        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            p += precision
            r += recall
            f += 2 * precision * recall / (precision + recall)

    if total == 0:
        return 0.0, 0.0, 0.0
    return f / total, p / total, r / total
|
|
|
|
|
|
|
|
|
|
|
def print_report(report, go_id):
    """Append a titled classification report to ``reports.txt`` in DATA_ROOT."""
    header = 'Classification report for ' + go_id + '\n'
    body = report + '\n'
    with open(DATA_ROOT + 'reports.txt', 'a') as out:
        out.write(header)
        out.write(body)
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: main() is a click command and is invoked without
# arguments here.
if __name__ == '__main__':
    main()