- #!/usr/bin/env python
-
- """
-
- """
- from __future__ import division
-
- import logging
- import sys
- import time
- from collections import deque
- from multiprocessing import Pool
-
- import click as ck
- import numpy as np
- import pandas as pd
- import tensorflow as tf
- from keras import backend as K
- from keras.callbacks import EarlyStopping, ModelCheckpoint
- from keras.layers import (
- Dense, Input, SpatialDropout1D, Conv1D, MaxPooling1D,
- Flatten, Concatenate, Add, Maximum, Embedding, BatchNormalization, Activation, Dropout)
- from keras.losses import binary_crossentropy
- from keras.models import Sequential, Model, load_model
- from keras.preprocessing import sequence
- from scipy.spatial import distance
- from sklearn.metrics import log_loss
- from sklearn.metrics import roc_curve, auc, matthews_corrcoef
- from keras.layers import Lambda
- from sklearn.metrics import precision_recall_curve
-
- from utils import (
- get_gene_ontology,
- get_go_set,
- get_anchestors,
- get_parents,
- DataGenerator,
- FUNC_DICT,
- get_height,
- get_ipro)
- from conditional_wgan_wrapper_post import WGAN_wrapper, wasserstein_loss, generator_recunstruction_loss_new
-
- config = tf.ConfigProto()
- config.gpu_options.allow_growth = True
- sess = tf.Session(config=config)
- K.set_session(sess)
-
- logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
- sys.setrecursionlimit(100000)
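- # The recursion limit is raised presumably because the GO-hierarchy helpers in
- # utils (e.g. get_anchestors/get_go_set) traverse the ontology recursively and
- # can exceed Python's default limit on deep DAGs.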
-
- DATA_ROOT = 'data/swiss/'
- MAXLEN = 258  # maximum input sequence length (previously 1000)
- REPLEN = 256
- ind = 0
-
-
- @ck.command()
- @ck.option(
- '--function',
- default='bp',
- help='Ontology id (mf, bp, cc)')
- @ck.option(
- '--device',
- default='gpu:0',
- help='GPU or CPU device id')
- @ck.option(
- '--org',
- default=None,
- help='Organism id for filtering test set')
- @ck.option('--train', default=True, is_flag=True)
- @ck.option('--param', default=0, help='Param index 0-7')
- def main(function, device, org, train, param):
- global FUNCTION
- FUNCTION = function
- global GO_ID
- GO_ID = FUNC_DICT[FUNCTION]
- global go
- go = get_gene_ontology('go.obo')
- global ORG
- ORG = org
- func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
- global functions
- functions = func_df['functions'].values
- global func_set
- func_set = set(functions)
- global all_functions
- all_functions = get_go_set(go, GO_ID)
- logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
- if ORG is not None:
- logging.info('Organism %s' % ORG)
- global go_indexes
- go_indexes = dict()
- for ind, go_id in enumerate(functions):
- go_indexes[go_id] = ind
- global node_names
- node_names = set()
- with tf.device('/' + device):
- params = {
- 'fc_output': 1024,
- 'learning_rate': 0.001,
- 'embedding_dims': 128,
- 'embedding_dropout': 0.2,
- 'nb_conv': 1,
- 'nb_dense': 1,
- 'filter_length': 128,
- 'nb_filter': 32,
- 'pool_length': 64,
- 'stride': 32
- }
- model(params, is_train=train)
- #dims = [64, 128, 256, 512]
- #nb_filters = [16, 32, 64, 128]
- #nb_convs = [1, 2, 3, 4]
- #nb_dense = [1, 2, 3, 4]
- #for i in range(param * 32, param * 32 + 32):
- # dim = i % 4
- # i = i / 4
- # nb_fil = i % 4
- # i /= 4
- # conv = i % 4
- # i /= 4
- # den = i
- # params['embedding_dims'] = dims[dim]
- # params['nb_filter'] = nb_filters[nb_fil]
- # params['nb_conv'] = nb_convs[conv]
- # params['nb_dense'] = nb_dense[den]
-
- # performanc_by_interpro()
-
-
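- # load_data2 (below) assumes two tab-separated matrices in data2/: all_data_X.csv
- # with one row of features per protein and all_data_Y.csv with one binary label
- # column per GO term; <FUNCTION>_branches.txt selects which label columns to keep.
- # It returns a random 80/20 train/test split and reuses the test split as the
- # validation set.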
- def load_data2():
- all_data_x_fn = 'data2/all_data_X.csv'
- all_data_x = pd.read_csv(all_data_x_fn, sep='\t', header=0, index_col=0)
- all_proteins_train = [p.replace('"', '') for p in all_data_x.index]
- all_data_x.index = all_proteins_train
- all_data_y_fn = 'data2/all_data_Y.csv'
- all_data_y = pd.read_csv(all_data_y_fn, sep='\t', header=0, index_col=0)
- branch = pd.read_csv('data2/'+FUNCTION +'_branches.txt', sep='\t', header=0, index_col=0)
- all_x = all_data_x.values
- branches = [p for p in branch.index.tolist() if p in all_data_y.columns.tolist()]
- t = pd.DataFrame(all_data_y, columns=branches)
- all_y = t.values
-
- number_of_test = int(np.ceil(0.2 * len(all_x)))
- # Sample test indices without replacement; the previous ceil-based sampling
- # could produce duplicate indices and an out-of-range index equal to len(all_x).
- index_test = list(np.random.choice(len(all_x), number_of_test, replace=False))
- index_train = [p for p in range(len(all_x)) if p not in index_test]
- train_data = all_x[index_train, : ] #[ :20000, : ]
- test_data = all_x[index_test, : ] #[20000: , : ]
- train_labels = all_y[index_train, : ] #[ :20000, : ]
- test_labels = all_y[index_test, :] #[20000: , : ]
- val_data = test_data
- val_labels = test_labels
- #print(sum(sum(train_labels)))
- #print(train_data.shape)
- print(train_labels.shape)
- print(test_labels.shape)
- return train_data, train_labels, test_data, test_labels, val_data, val_labels
-
-
-
-
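- # load_data (below) is the original pickle-based loader: it reads the
- # train-<FUNCTION>.pkl / test-<FUNCTION>.pkl data frames, keeps 80% of the
- # training frame for training and 20% for validation, pads the n-gram encoded
- # sequences to MAXLEN, and optionally filters the test set by organism.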
- def load_data():
-
- df = pd.read_pickle(DATA_ROOT + 'train' + '-' + FUNCTION + '.pkl')
- n = len(df)
- index = df.index.values
- valid_n = int(n * 0.8)
- train_df = df.loc[index[:valid_n]]
- valid_df = df.loc[index[valid_n:]]
- test_df = pd.read_pickle(DATA_ROOT + 'test' + '-' + FUNCTION + '.pkl')
- print(test_df['orgs'])
- if ORG is not None:
- logging.info('Unfiltered test size: %d' % len(test_df))
- test_df = test_df[test_df['orgs'] == ORG]
- logging.info('Filtered test size: %d' % len(test_df))
-
- # Filter by type
- # org_df = pd.read_pickle('data/prokaryotes.pkl')
- # orgs = org_df['orgs']
- # test_df = test_df[test_df['orgs'].isin(orgs)]
-
- def reshape(values):
- values = np.hstack(values).reshape(
- len(values), len(values[0]))
- return values
-
- def normalize_minmax(values):
- mn = np.min(values)
- mx = np.max(values)
- if mx - mn != 0.0:
- return (values - mn) / (mx - mn)
- return values - mn
-
- def get_values(data_frame):
- print(data_frame['labels'].values.shape)
- labels = reshape(data_frame['labels'].values)
- ngrams = sequence.pad_sequences(
- data_frame['ngrams'].values, maxlen=MAXLEN)
- ngrams = reshape(ngrams)
- rep = reshape(data_frame['embeddings'].values)
- data = ngrams
- return data, labels
-
- train = get_values(train_df)
- valid = get_values(valid_df)
- test = get_values(test_df)
-
- return train, valid, test, train_df, valid_df, test_df
-
-
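- # get_feature_model (below) builds a small 1D-CNN sequence encoder: an
- # 8000-symbol n-gram embedding, spatial dropout, one or more Conv1D layers,
- # max pooling and a final Flatten. A minimal usage sketch (parameter values
- # mirror the defaults set in main(); shown only as an illustration):
- #
- #     feats = get_feature_model({'embedding_dims': 128, 'nb_conv': 1,
- #                                'nb_filter': 32, 'filter_length': 128,
- #                                'pool_length': 64, 'stride': 32})
- #     vec = feats.predict(np.zeros((1, MAXLEN)))  # -> flattened feature vector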
- def get_feature_model(params):
- embedding_dims = params['embedding_dims']
- max_features = 8001
- model = Sequential()
- model.add(Embedding(
- max_features,
- embedding_dims,
- input_length=MAXLEN))
- model.add(SpatialDropout1D(0.4))
- for i in range(params['nb_conv']):
- model.add(Conv1D(
- activation="relu",
- padding="valid",
- strides=1,
- filters=params['nb_filter'],
- kernel_size=params['filter_length']))
- model.add(MaxPooling1D(strides=params['stride'], pool_size=params['pool_length']))
- model.add(Flatten())
- return model
-
-
- def merge_outputs(outputs, name):
- if len(outputs) == 1:
- return outputs[0]
- ## return merge(outputs, mode='concat', name=name, concat_axis=1)
- return Concatenate(axis=1, name=name)(outputs)
-
-
- def merge_nets(nets, name):
- if len(nets) == 1:
- return nets[0]
- ## return merge(nets, mode='sum', name=name)
- return Add(name=name)(nets)
-
- def get_node_name(go_id, unique=False):
- name = go_id.split(':')[1]
- if not unique:
- return name
- if name not in node_names:
- node_names.add(name)
- return name
- i = 1
- while (name + '_' + str(i)) in node_names:
- i += 1
- name = name + '_' + str(i)
- node_names.add(name)
- return name
-
-
- def get_function_node(name, inputs):
- output_name = name + '_out'
- # net = Dense(256, name=name, activation='relu')(inputs)
- output = Dense(1, name=output_name, activation='sigmoid')(inputs)
- return output, output
-
-
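- # get_generator (below) is the WGAN generator / predictor: a plain MLP that maps
- # the MAXLEN-dimensional input representation through 150 -> 150 -> 70 ReLU
- # layers to n_classes sigmoid outputs (one probability per GO term). The
- # commented-out feature_model line is the alternative CNN front end.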
- def get_generator(params, n_classes):
- inputs = Input(shape=(MAXLEN,), dtype='float32', name='input1')
- #feature_model = get_feature_model(params)(inputs)
- net0 = Dense(150, activation='relu')(inputs)
- net0 = Dense(150, activation='relu')(net0)
- #net0 = Dense(50, activation='relu')(net0)
- net = Dense(70, activation='relu')(net0)
- output = Dense(n_classes, activation='sigmoid')(net)
- model = Model(inputs=inputs, outputs=output)
-
- return model
-
-
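- # get_discriminator (below) is the conditional critic: it receives the label
- # vector (real or generated) together with the raw n-gram sequence, embeds and
- # convolves the sequence, merges both branches through dense blocks, and ends
- # in a single linear unit, i.e. an unbounded Wasserstein critic score rather
- # than a sigmoid probability.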
- def get_discriminator(params, n_classes, dropout_rate=0.5):
- inputs = Input(shape=(n_classes, ))
- inputs2 = Input(shape=(MAXLEN,), dtype='int32', name='d_input2')
- x2 = Embedding(8001, 128, input_length=MAXLEN)(inputs2)
- x2 = Conv1D(filters=1, kernel_size=1, padding='valid', activation='relu', strides=1)(x2)
- x2 = Lambda(lambda x: K.squeeze(x, 2))(x2)
-
- #for i in range(params['nb_conv']):
- # x2 = Conv1D ( activation="relu", padding="valid", strides=1, filters=params['nb_filter'],kernel_size=params['filter_length'])(x2)
-
-
- #x2 =MaxPooling1D(strides=params['stride'], pool_size=params['pool_length'])(x2)
- #x2 = Flatten()(x2)
-
- size = 40
- x = inputs
- x = Dropout(dropout_rate)(x)
- x = Dense(size)(x)
- x = BatchNormalization()(x)
- x = Activation('relu')(x)
-
-
- size = 40
- x2 = Dropout(dropout_rate)(x2)
- x2 = Dense(size)(x2)
- x2 = BatchNormalization()(x2)
- x2 = Activation('relu')(x2)
-
-
- x = Concatenate(axis=1, name='merged2')([x, x2])
- layer_sizes = [80, 40, 30]
- for size in layer_sizes:
- x = Dropout(dropout_rate)(x)
- x = Dense(size)(x)
- x = BatchNormalization()(x)
- x = Activation('relu')(x)
-
-
- outputs = Dense(1)(x)
- model = Model(inputs=[inputs, inputs2], outputs=outputs, name='Discriminator')
-
- return model
-
-
-
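- # get_model (below) wires the generator and critic together through the external
- # WGAN_wrapper, which is expected to return two compiled models: a generator
- # model trained with the reconstruction + adversarial losses, and a discriminator
- # model trained with the Wasserstein loss plus a gradient penalty (weight
- # GRADIENT_PENALTY_WEIGHT).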
- def get_model(params, nb_classes, batch_size, GRADIENT_PENALTY_WEIGHT=10):
- generator = get_generator(params, nb_classes)
- discriminator = get_discriminator(params, nb_classes)
-
- generator_model, discriminator_model = \
- WGAN_wrapper(generator=generator,
- discriminator=discriminator,
- generator_input_shape=(MAXLEN,),
- discriminator_input_shape=(nb_classes,),
- discriminator_input_shape2=(MAXLEN,),
- batch_size=batch_size,
- gradient_penalty_weight=GRADIENT_PENALTY_WEIGHT)
-
- logging.info('Compilation finished')
- return generator_model, discriminator_model
-
-
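- # train_wgan (below) follows the usual WGAN-GP schedule: for every generator
- # update the critic is trained TRAINING_RATIO times on minibatches drawn from two
- # independent shuffles; the target vectors positive_y / negative_y / dummy_y feed
- # the Wasserstein and gradient-penalty loss terms. During the first N_WARM_UP
- # epochs only the generator is trained (reconstruction loss only). After each
- # epoch the generator is scored with log loss on the training and validation
- # data, and both models are checkpointed whenever the validation loss improves.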
- def train_wgan(generator_model, discriminator_model, batch_size, epochs,
- x_train, y_train, x_val, y_val, generator_model_path, discriminator_model_path,
- TRAINING_RATIO=10, N_WARM_UP=0):
- BATCH_SIZE = batch_size
- N_EPOCH = epochs
-
-
- positive_y = np.ones((batch_size, 1), dtype=np.float32)
- zero_y = positive_y * 0
- negative_y = -positive_y
- positive_full_y = np.ones((BATCH_SIZE * TRAINING_RATIO, 1), dtype=np.float32)
- dummy_y = np.zeros((BATCH_SIZE, 1), dtype=np.float32)
-
- positive_full_enable_train = np.ones((len(x_train), 1), dtype=np.float32)
- positive_full_enable_val = np.ones((len(x_val), 1), dtype=np.float32)
- #positive_enable_train = np.ones((1, batch_size),dtype = np.float32 )
- #positive_full_train_enable = np.ones((1,BATCH_SIZE * TRAINING_RATIO ), dtype=np.float32 )
- best_validation_loss = None
-
- for epoch in range(N_EPOCH):
- # np.random.shuffle(X_train)
- print("Epoch: ", epoch)
- print("Number of batches: ", int(y_train.shape[0] // BATCH_SIZE))
- discriminator_loss = []
- generator_loss = []
- minibatches_size = BATCH_SIZE * TRAINING_RATIO
-
- shuffled_indexes = np.random.permutation(x_train.shape[0])
- shuffled_indexes_2 = np.random.permutation(x_train.shape[0])
-
- for i in range(int(y_train.shape[0] // (BATCH_SIZE * TRAINING_RATIO))):
- batch_indexes = shuffled_indexes[i * minibatches_size:(i + 1) * minibatches_size]
- batch_indexes_2 = shuffled_indexes_2[i * minibatches_size:(i + 1) * minibatches_size]
- x = x_train[batch_indexes]
- y = y_train[batch_indexes]
- y_2 = y_train[batch_indexes_2]
- x_2 = x_train[batch_indexes_2]
- if epoch < N_WARM_UP:
- for j in range(TRAINING_RATIO):
- x_batch = x[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
- y_batch = y[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
-
- generator_loss.append(generator_model.train_on_batch([x_batch, positive_y], [y_batch, zero_y]))
- else:
- for j in range(TRAINING_RATIO):
- x_batch = x[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
- y_batch_2 = y_2[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
- x_batch_2 = x_2[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
- # noise = np.random.rand(BATCH_SIZE, 100).astype(np.float32)
- noise = x_batch
- #print(sum(y_batch_2))
- discriminator_loss.append(discriminator_model.train_on_batch(
- [y_batch_2, noise, x_batch_2 ],
- [positive_y, negative_y, dummy_y]))
-
- generator_loss.append(generator_model.train_on_batch([x, positive_full_y], [y, positive_full_y]))
- # Still needs some code to display losses from the generator and discriminator, progress bars, etc.
- predicted_y_train, _ = generator_model.predict([x_train, positive_full_enable_train], batch_size=BATCH_SIZE)
- predicted_y_val, _ = generator_model.predict([x_val, positive_full_enable_val], batch_size=BATCH_SIZE)
-
- #print(sum(sum(positive_full_enable_train)))
- #print(predicted_y_train)
- train_loss = log_loss(y_train, predicted_y_train)
- val_loss = log_loss(y_val, predicted_y_val)
-
- print("train loss: {:.4f}, validation loss: {:.4f}, discriminator loss: {:.4f}".format(
- train_loss, val_loss,
- (np.sum(np.asarray(discriminator_loss)) if discriminator_loss else -1) / x_train.shape[0]))
-
- if best_validation_loss is None or best_validation_loss > val_loss:
- print('\nEpoch %05d: validation loss improved to %0.5f,'
- ' saving model to %s and %s'
- % (epoch + 1, val_loss, generator_model_path, discriminator_model_path))
-
- best_validation_loss = val_loss
- generator_model.save(generator_model_path, overwrite=True)
- discriminator_model.save(discriminator_model_path, overwrite=True)
-
-
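- # model (below) is the end-to-end driver: load the data, build the DataGenerators,
- # optionally train the WGAN, reload the best generator checkpoint and report
- # protein-centric F-max, ROC AUC, MCC, AUPR and function-centric micro/macro F1.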
- def model(params, batch_size=20, nb_epoch=40, is_train=True):
- # set parameters:
- #nb_classes = len(functions)
- start_time = time.time()
- logging.info("Loading Data")
-
- ##
- #train, val, test, train_df, valid_df, test_df = load_data()
- #train_df = pd.concat([train_df, valid_df])
- #test_gos = test_df['gos'].values
- #train_data, train_labels = train
- #val_data, val_labels = val
- #test_data, test_labels = test
- ##
-
- train_data, train_labels, test_data, test_labels, val_data, val_labels = load_data2()
- nb_classes = train_labels.shape[1]
-
- logging.info("Data loaded in %d sec" % (time.time() - start_time))
- logging.info("Training data size: %d" % len(train_data))
- logging.info("Validation data size: %d" % len(val_data))
- logging.info("Test data size: %d" % len(test_data))
- generator_model_path = DATA_ROOT + 'models/new_model_seq_' + FUNCTION + '.h5'
- discriminator_model_path = DATA_ROOT + 'models/new_model_disc_seq_' + FUNCTION + '.h5'
-
-
- logging.info('Starting training the model')
-
- train_generator = DataGenerator(batch_size, nb_classes)
- train_generator.fit(train_data, train_labels)
- valid_generator = DataGenerator(batch_size, nb_classes)
- valid_generator.fit(val_data, val_labels)
- test_generator = DataGenerator(batch_size, nb_classes)
- test_generator.fit(test_data, test_labels)
-
- if is_train:
- generator_model, discriminator_model = get_model(params, nb_classes, batch_size)
- train_wgan(generator_model, discriminator_model, batch_size=batch_size, epochs=nb_epoch,
- x_train=train_data, y_train=train_labels, x_val=val_data, y_val=val_labels,
- generator_model_path=generator_model_path,
- discriminator_model_path=discriminator_model_path)
-
- logging.info('Loading best model')
- model = load_model(generator_model_path,
- custom_objects={'generator_recunstruction_loss_new': generator_recunstruction_loss_new,
- 'wasserstein_loss': wasserstein_loss})
-
- logging.info('Predicting')
- preds = model.predict_generator(test_generator, steps=len(test_data) / batch_size)[0]
-
- # incon = 0
- # for i in xrange(len(test_data)):
- # for j in xrange(len(functions)):
- # childs = set(go[functions[j]]['children']).intersection(func_set)
- # ok = True
- # for n_id in childs:
- # if preds[i, j] < preds[i, go_indexes[n_id]]:
- # preds[i, j] = preds[i, go_indexes[n_id]]
- # ok = False
- # if not ok:
- # incon += 1
- logging.info('Computing performance')
- f, p, r, t, preds_max = compute_performance(preds, test_labels) #, test_gos)
- roc_auc = compute_roc(preds, test_labels)
- mcc = compute_mcc(preds_max, test_labels)
- aupr, _ = compute_aupr(preds, test_labels)
- m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max = micro_macro_function_centric_f1(preds.T, test_labels.T)
-
- logging.info('Protein centric macro Th, PR, RC, F1: \t %f %f %f %f' % (t, p, r, f))
- logging.info('ROC AUC: \t %f ' % (roc_auc, ))
- logging.info('MCC: \t %f ' % (mcc, ))
- logging.info('AUPR: \t %f ' % (aupr, ))
- logging.info('Function centric macro PR, RC, F1: \t %f %f %f' % (M_pr_max, M_rc_max, M_f1_max))
- logging.info('Function centric micro PR, RC, F1: \t %f %f %f' % (m_pr_max, m_rc_max, m_f1_max))
-
- function_centric_performance(functions, preds.T, test_labels.T, train_labels.T)
-
-
-
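- # load_prot_ipro (below) assumes swissprot_ipro.tab has three tab-separated
- # columns (id, protein accession, ';'-joined InterPro ids) and returns them as
- # a proteins/ipros data frame for the InterPro-stratified evaluation.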
- def load_prot_ipro():
- proteins = list()
- ipros = list()
- with open(DATA_ROOT + 'swissprot_ipro.tab') as f:
- for line in f:
- it = line.strip().split('\t')
- if len(it) != 3:
- continue
- prot = it[1]
- iprs = it[2].split(';')
- proteins.append(prot)
- ipros.append(iprs)
- return pd.DataFrame({'proteins': proteins, 'ipros': ipros})
-
-
- def performanc_by_interpro():
- pred_df = pd.read_pickle(DATA_ROOT + 'test-' + FUNCTION + '-preds.pkl')
- ipro_df = load_prot_ipro()
- df = pred_df.merge(ipro_df, on='proteins', how='left')
- ipro = get_ipro()
-
- def reshape(values):
- values = np.hstack(values).reshape(
- len(values), len(values[0]))
- return values
-
- for ipro_id in ipro:
- if len(ipro[ipro_id]['parents']) > 0:
- continue
- labels = list()
- predictions = list()
- gos = list()
- for i, row in df.iterrows():
- if not isinstance(row['ipros'], list):
- continue
- if ipro_id in row['ipros']:
- labels.append(row['labels'])
- predictions.append(row['predictions'])
- gos.append(row['gos'])
- pr = 0
- rc = 0
- total = 0
- p_total = 0
- for i in range(len(labels)):
- tp = np.sum(labels[i] * predictions[i])
- fp = np.sum(predictions[i]) - tp
- fn = np.sum(labels[i]) - tp
- all_gos = set()
- for go_id in gos[i]:
- if go_id in all_functions:
- all_gos |= get_anchestors(go, go_id)
- all_gos.discard(GO_ID)
- all_gos -= func_set
- fn += len(all_gos)
- if tp == 0 and fp == 0 and fn == 0:
- continue
- total += 1
- if tp != 0:
- p_total += 1
- precision = tp / (1.0 * (tp + fp))
- recall = tp / (1.0 * (tp + fn))
- pr += precision
- rc += recall
- if total > 0 and p_total > 0:
- rc /= total
- pr /= p_total
- if pr + rc > 0:
- f = 2 * pr * rc / (pr + rc)
- logging.info('%s\t%d\t%f\t%f\t%f' % (
- ipro_id, len(labels), f, pr, rc))
-
-
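- # function_centric_performance (below) sweeps 99 thresholds per GO term and
- # records, for each term, the best F1 with its precision/recall, the number of
- # training proteins annotated with the term, and the term's height in the GO DAG;
- # results are written to Con_GodGanSeq_results_<FUNCTION>.txt.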
- def function_centric_performance(functions, preds, labels, labels_train):
- results = []
- preds = np.round(preds, 2)
- for i in range(preds.shape[0]):
- f_max = 0
- p_max = 0
- r_max = 0
- for t in range(1, 100):
- threshold = t / 100.0
- predictions = (preds[i, :] > threshold).astype(np.int32)
- tp = np.sum(predictions * labels[i, :])
- fp = np.sum(predictions) - tp
- fn = np.sum(labels[i, :]) - tp
- if tp > 0:
- precision = tp / (1.0 * (tp + fp))
- recall = tp / (1.0 * (tp + fn))
- f = 2 * precision * recall / (precision + recall)
- else:
- if fp == 0 and fn == 0:
- precision = 1
- recall = 1
- f = 1
- else:
- precision = 0
- recall = 0
- f = 0
-
- if f_max < f:
- f_max = f
- p_max = precision
- r_max = recall
- num_prots_train = np.sum(labels_train[i, :])
- height = get_height(go, functions[i])
- results.append([functions[i], num_prots_train, height, f_max, p_max, r_max])
- results = pd.DataFrame(results)
- results.to_csv('Con_GodGanSeq_results_' + FUNCTION + '.txt', sep='\t', index=False)
-
-
-
- def function_centric_performance_backup(functions, preds, labels, labels_train):
- results = []
- preds = np.round(preds, 2)
- for i in range(len(functions)):
- f_max = 0
- p_max = 0
- r_max = 0
- x = list()
- y = list()
- total = 0
- for t in range(1, 100):
- threshold = t / 100.0
- predictions = (preds[i, :] > threshold).astype(np.int32)
- tp = np.sum(predictions * labels[i, :])
- fp = np.sum(predictions) - tp
- fn = np.sum(labels[i, :]) - tp
- if tp > 0:
- sn = tp / (1.0 * np.sum(labels[i, :]))
- sp = np.sum((predictions ^ 1) * (labels[i, :] ^ 1))
- sp /= 1.0 * np.sum(labels[i, :] ^ 1)
- fpr = 1 - sp
- x.append(fpr)
- y.append(sn)
- precision = tp / (1.0 * (tp + fp))
- recall = tp / (1.0 * (tp + fn))
- f = 2 * precision * recall / (precision + recall)
- total += 1
- if f_max < f:
- f_max = f
- p_max = precision
- r_max = recall
- num_prots = np.sum(labels[i, :])
- num_prots_train = np.sum(labels_train[i, :])
- if total > 1:
- roc_auc = auc(x, y)
- else:
- roc_auc = 0
- height = get_height(go, functions[i])
- results.append([functions[i], f_max, p_max, r_max, num_prots, num_prots_train, height, roc_auc])
- results = pd.DataFrame(results)
- #results.to_csv('new_results.txt' , sep='\t' , index = False)
- results.to_csv('Con_GodGanSeq_results_' + FUNCTION + '.txt', sep='\t', index=False)
- #results = np.array(results)
- #p_mean = (np.sum(results[:,2])) / len(functions)
- #r_mean = (np.sum(results[:,3])) / len(functions)
- #f_mean = (2*p_mean*r_mean)/(p_mean+r_mean)
- #roc_auc_mean = (np.sum(results[:,7])) / len(functions)
- #print('Function centric performance (macro) ' '%f %f %f %f' % (f_mean, p_mean, r_mean, roc_auc_mean))
-
-
-
- def micro_macro_function_centric_f1_backup(preds, labels):
- preds = np.round(preds, 2)
- m_f1_max = 0
- M_f1_max = 0
- for t in range(1, 100):
- threshold = t / 100.0
- predictions = (preds > threshold).astype(np.int32)
- m_tp = 0
- m_fp = 0
- m_fn = 0
- M_pr = 0
- M_rc = 0
- total = 0
- p_total = 0
- for i in range(len(preds)):
- tp = np.sum(predictions[i, :] * labels[i, :])
- fp = np.sum(predictions[i, :]) - tp
- fn = np.sum(labels[i, :]) - tp
- if tp == 0 and fp == 0 and fn == 0:
- continue
- total += 1
- if tp > 0:
- pr = tp / (1.0 * (tp + fp))
- rc = tp / (1.0 * (tp + fn))
- m_tp += tp
- m_fp += fp
- m_fn += fn
- M_pr += pr
- M_rc += rc
- p_total += 1
-
- if p_total == 0:
- continue
- if total > 0:
- m_tp /= total
- m_fn /= total
- m_fp /= total
- m_pr = m_tp / (1.0 * (m_tp + m_fp))
- m_rc = m_tp / (1.0 * (m_tp + m_fn))
- M_pr /= p_total
- M_rc /= total
- m_f1 = 2 * m_pr * m_rc / (m_pr + m_rc)
- M_f1 = 2 * M_pr * M_rc / (M_pr + M_rc)
-
- if m_f1 > m_f1_max:
- m_f1_max = m_f1
- m_pr_max = m_pr
- m_rc_max = m_rc
-
- if M_f1 > M_f1_max:
- M_f1_max = M_f1
- M_pr_max = M_pr
- M_rc_max = M_rc
-
- return m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max
-
-
-
-
-
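- # micro_macro_function_centric_f1 (below) sweeps 199 thresholds and returns the
- # best micro-averaged (pooled TP/FP/FN) and macro-averaged (mean per-row
- # precision/recall; here per GO term, since the caller passes transposed
- # matrices) F1 together with the corresponding precision and recall values.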
- def micro_macro_function_centric_f1(preds, labels):
- preds = np.round(preds, 2)
- m_f1_max = 0
- M_f1_max = 0
- # initialise the returned precision/recall values so they are defined even if
- # the F1 score never improves over zero
- m_pr_max = m_rc_max = 0
- M_pr_max = M_rc_max = 0
- for t in range(1, 200):
- threshold = t / 200.0
- predictions = (preds > threshold).astype(np.int32)
- m_tp = 0
- m_fp = 0
- m_fn = 0
- M_pr = 0
- M_rc = 0
- for i in range(preds.shape[0]):
- tp = np.sum(predictions[i, :] * labels[i, :])
- fp = np.sum(predictions[i, :]) - tp
- fn = np.sum(labels[i, :]) - tp
- m_tp += tp
- m_fp += fp
- m_fn += fn
- if tp > 0:
- pr = 1.0 * tp / (1.0 * (tp + fp))
- rc = 1.0 * tp / (1.0 * (tp + fn))
- else:
- if fp == 0 and fn == 0:
- pr = 1
- rc = 1
- else:
- pr = 0
- rc = 0
- M_pr += pr
- M_rc += rc
-
- if m_tp > 0:
- m_pr = 1.0 * m_tp / (1.0 * (m_tp + m_fp))
- m_rc = 1.0 * m_tp / (1.0 * (m_tp + m_fn))
- m_f1 = 2.0 * m_pr * m_rc / (m_pr + m_rc)
- else:
- if m_fp == 0 and m_fn == 0:
- m_pr = 1
- m_rc = 1
- m_f1 = 1
- else:
- m_pr = 0
- m_rc = 0
- m_f1 = 0
-
- M_pr /= preds.shape[0]
- M_rc /= preds.shape[0]
- if M_pr == 0 and M_rc == 0:
- M_f1 = 0
- else:
- M_f1 = 2.0 * M_pr * M_rc / (M_pr + M_rc)
- if m_f1 > m_f1_max:
- m_f1_max = m_f1
- m_pr_max = m_pr
- m_rc_max = m_rc
-
- if M_f1 > M_f1_max:
- M_f1_max = M_f1
- M_pr_max = M_pr
- M_rc_max = M_rc
-
- return m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max
-
-
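- # compute_roc / compute_aupr / compute_mcc (below) all flatten the prediction
- # and label matrices first, so they report micro-averaged scores over every
- # protein-term pair rather than per-class averages.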
- def compute_roc(preds, labels):
- # Compute a micro-averaged ROC curve and AUC over the flattened predictions
- fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten())
- roc_auc = auc(fpr, tpr)
- return roc_auc
-
-
- def compute_aupr(preds, labels):
- # Compute the precision-recall curve and micro-averaged AUPR over the flattened predictions
- pr, rc, threshold = precision_recall_curve(labels.flatten(), preds.flatten())
- pr_auc = auc(rc, pr)
- #pr, rc, threshold =precision_recall_curve(labels.flatten(), preds.flatten(),average ='macro' )
- M_pr_auc = 0
- return pr_auc, M_pr_auc
-
-
-
- def compute_mcc(preds, labels):
- # Compute the Matthews correlation coefficient over the flattened (binary) predictions
- mcc = matthews_corrcoef(labels.flatten(), preds.flatten())
- return mcc
-
-
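- # compute_performance (below) is the protein-centric F-max evaluation (CAFA
- # style): sweep 99 global thresholds, compute per-protein precision and recall
- # (precision averaged only over proteins with at least one predicted term),
- # and return the best F1 with its threshold and binary prediction matrix.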
- def compute_performance(preds, labels): #, gos):
- preds = np.round(preds, 2)
- f_max = 0
- p_max = 0
- r_max = 0
- t_max = 0
- predictions_max = None  # defined up front in case no threshold improves the F1
- for t in range(1, 100):
- threshold = t / 100.0
- predictions = (preds > threshold).astype(np.int32)
- total = 0
- f = 0.0
- p = 0.0
- r = 0.0
- p_total = 0
- for i in range(labels.shape[0]):
- tp = np.sum(predictions[i, :] * labels[i, :])
- fp = np.sum(predictions[i, :]) - tp
- fn = np.sum(labels[i, :]) - tp
- all_gos = set()
- #for go_id in gos[i]:
- # if go_id in all_functions:
- # all_gos |= get_anchestors(go, go_id)
- #all_gos.discard(GO_ID)
- #all_gos -= func_set
- #fn += len(all_gos)
- if tp == 0 and fp == 0 and fn == 0:
- continue
- total += 1
- if tp != 0:
- p_total += 1
- precision = tp / (1.0 * (tp + fp))
- recall = tp / (1.0 * (tp + fn))
- p += precision
- r += recall
- if p_total == 0:
- continue
- r /= total
- p /= p_total
- if p + r > 0:
- f = 2 * p * r / (p + r)
- if f_max < f:
- f_max = f
- p_max = p
- r_max = r
- t_max = threshold
- predictions_max = predictions
- return f_max, p_max, r_max, t_max, predictions_max
-
-
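- # get_gos / compute_similarity_performance (below) form a nearest-neighbour
- # baseline: each predicted label vector is matched to the closest training
- # label vector by cosine distance, and the matched protein's GO annotations
- # (propagated to their ancestors) are scored against the test annotations.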
- def get_gos(pred):
- mdist = 1.0
- mgos = None
- for i in range(len(labels_gos)):
- labels, gos = labels_gos[i]
- dist = distance.cosine(pred, labels)
- if mdist > dist:
- mdist = dist
- mgos = gos
- return mgos
-
-
- def compute_similarity_performance(train_df, test_df, preds):
- logging.info("Computing similarity performance")
- logging.info("Training data size %d" % len(train_df))
- train_labels = train_df['labels'].values
- train_gos = train_df['gos'].values
- global labels_gos
- labels_gos = list(zip(train_labels, train_gos))
- pool = Pool(64)  # renamed so the precision accumulator below does not shadow the pool
- pred_gos = pool.map(get_gos, preds)
- total = 0
- p = 0.0
- r = 0.0
- f = 0.0
- test_gos = test_df['gos'].values
- for gos, tgos in zip(pred_gos, test_gos):
- preds = set()
- test = set()
- for go_id in gos:
- if go_id in all_functions:
- preds |= get_anchestors(go, go_id)
- for go_id in tgos:
- if go_id in all_functions:
- test |= get_anchestors(go, go_id)
- tp = len(preds.intersection(test))
- fp = len(preds - test)
- fn = len(test - preds)
- if tp == 0 and fp == 0 and fn == 0:
- continue
- total += 1
- if tp != 0:
- precision = tp / (1.0 * (tp + fp))
- recall = tp / (1.0 * (tp + fn))
- p += precision
- r += recall
- f += 2 * precision * recall / (precision + recall)
- return f / total, p / total, r / total
-
-
- def print_report(report, go_id):
- with open(DATA_ROOT + 'reports.txt', 'a') as f:
- f.write('Classification report for ' + go_id + '\n')
- f.write(report + '\n')
-
-
- if __name__ == '__main__':
- main()
|