
Delete 'conditional_new_nn_hirearchical_seq_mt.py'

commit 7498f1c80e (master)
1 changed file with 0 additions and 749 deletions

conditional_new_nn_hirearchical_seq_mt.py  (+0, -749)

@@ -1,749 +0,0 @@
#!/usr/bin/env python

"""

"""
from __future__ import division

import logging
import sys
import time
from collections import deque
from multiprocessing import Pool

import click as ck
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import (
Dense, Input, SpatialDropout1D, Conv1D, MaxPooling1D,
Flatten, Concatenate, Add, Maximum, Embedding, BatchNormalization,
Activation, Dropout, Lambda)
from keras.losses import binary_crossentropy
from keras.models import Sequential, Model, load_model
from keras.preprocessing import sequence
from scipy.spatial import distance
from sklearn.metrics import log_loss, roc_curve, auc, matthews_corrcoef, precision_recall_curve

from utils import (
get_gene_ontology,
get_go_set,
get_anchestors,
get_parents,
DataGenerator,
FUNC_DICT,
get_height,
get_ipro)
from conditional_wgan_wrapper import WGAN_wrapper, wasserstein_loss, generator_recunstruction_loss_new

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)
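# NOTE: tf.ConfigProto / tf.Session and K.set_session are TensorFlow 1.x / standalone
# Keras 2.x APIs; this script will not run unmodified under TensorFlow 2.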

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
sys.setrecursionlimit(100000)

DATA_ROOT = 'data/swiss/'
MAXLEN = 258 #1000
REPLEN = 256
ind = 0


@ck.command()
@ck.option(
'--function',
default='bp',
help='Ontology id (mf, bp, cc)')
@ck.option(
'--device',
default='gpu:0',
help='GPU or CPU device id')
@ck.option(
'--org',
default=None,
help='Organism id for filtering test set')
@ck.option('--train', default=True, is_flag=True)
@ck.option('--param', default=0, help='Param index 0-7')
def main(function, device, org, train, param):
global FUNCTION
FUNCTION = function
global GO_ID
GO_ID = FUNC_DICT[FUNCTION]
global go
go = get_gene_ontology('go.obo')
global ORG
ORG = org
func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
global functions
functions = func_df['functions'].values
global func_set
func_set = set(functions)
global all_functions
all_functions = get_go_set(go, GO_ID)
logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
if ORG is not None:
logging.info('Organism %s' % ORG)
global go_indexes
go_indexes = dict()
for ind, go_id in enumerate(functions):
go_indexes[go_id] = ind
global node_names
node_names = set()
with tf.device('/' + device):
params = {
'fc_output': 1024,
'learning_rate': 0.001,
'embedding_dims': 128,
'embedding_dropout': 0.2,
'nb_conv': 1,
'nb_dense': 1,
'filter_length': 128,
'nb_filter': 32,
'pool_length': 64,
'stride': 32
}
model(params, is_train=train)



def load_data2():
all_data_x_fn = 'data2/all_data_X.csv'
all_data_x = pd.read_csv(all_data_x_fn, sep='\t', header=0, index_col=0)
all_proteins_train = [p.replace('"', '') for p in all_data_x.index]
all_data_x.index = all_proteins_train
all_data_y_fn = 'data2/all_data_Y.csv'
all_data_y = pd.read_csv(all_data_y_fn, sep='\t', header=0, index_col=0)
branch = pd.read_csv('data2/' + FUNCTION + '_branches.txt', sep='\t', header=0, index_col=0)
all_x = all_data_x.values
branches = [p for p in branch.index.tolist() if p in all_data_y.columns.tolist()]
t = pd.DataFrame(all_data_y, columns=branches)
all_y = t.values
number_of_test = int(np.ceil(0.2 * len(all_x)))
# sample test indices without replacement so they are unique and stay in range
index_test = np.random.choice(len(all_x), number_of_test, replace=False)
index_train = [p for p in range(len(all_x)) if p not in index_test]
train_data = all_x[index_train, : ] #[ :20000, : ]
test_data = all_x[index_test, : ] #[20000: , : ]
train_labels = all_y[index_train, : ] #[ :20000, : ]
test_labels = all_y[index_test, :] #[20000: , : ]
val_data = test_data
val_labels = test_labels
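# NOTE: the validation and test splits are identical here, so the reported test
# metrics are computed on the same data used to select the best model.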
#print(sum(sum(train_labels)))
#print(train_data.shape)
print(train_labels.shape)
print(test_labels.shape)
return train_data, train_labels, test_data, test_labels, val_data, val_labels




def load_data():

df = pd.read_pickle(DATA_ROOT + 'train' + '-' + FUNCTION + '.pkl')
n = len(df)
index = df.index.values
valid_n = int(n * 0.8)
train_df = df.loc[index[:valid_n]]
valid_df = df.loc[index[valid_n:]]
test_df = pd.read_pickle(DATA_ROOT + 'test' + '-' + FUNCTION + '.pkl')
print(test_df['orgs'])
if ORG is not None:
logging.info('Unfiltered test size: %d' % len(test_df))
test_df = test_df[test_df['orgs'] == ORG]
logging.info('Filtered test size: %d' % len(test_df))

# Filter by type
# org_df = pd.read_pickle('data/prokaryotes.pkl')
# orgs = org_df['orgs']
# test_df = test_df[test_df['orgs'].isin(orgs)]

def reshape(values):
values = np.hstack(values).reshape(
len(values), len(values[0]))
return values

def normalize_minmax(values):
mn = np.min(values)
mx = np.max(values)
if mx - mn != 0.0:
return (values - mn) / (mx - mn)
return values - mn

def get_values(data_frame):
print(data_frame['labels'].values.shape)
labels = reshape(data_frame['labels'].values)
ngrams = sequence.pad_sequences(
data_frame['ngrams'].values, maxlen=MAXLEN)
ngrams = reshape(ngrams)
rep = reshape(data_frame['embeddings'].values)
data = ngrams
return data, labels

train = get_values(train_df)
valid = get_values(valid_df)
test = get_values(test_df)

return train, valid, test, train_df, valid_df, test_df


def get_feature_model(params):
embedding_dims = params['embedding_dims']
max_features = 8001
model = Sequential()
model.add(Embedding(
max_features,
embedding_dims,
input_length=MAXLEN))
model.add(SpatialDropout1D(0.4))
for i in range(params['nb_conv']):
model.add(Conv1D(
activation="relu",
padding="valid",
strides=1,
filters=params['nb_filter'],
kernel_size=params['filter_length']))
model.add(MaxPooling1D(strides=params['stride'], pool_size=params['pool_length']))
model.add(Flatten())
return model


def get_generator(params, n_classes):
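# Generator: a small fully connected network (150 -> 150 -> 70 units) mapping the
# MAXLEN-dimensional input to per-class sigmoid scores; the convolutional
# feature model (get_feature_model) is left commented out below.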
inputs = Input(shape=(MAXLEN,), dtype='float32', name='input1')
#feature_model = get_feature_model(params)(inputs)
net0 = Dense(150, activation='relu')(inputs)
net0 = Dense(150, activation='relu')(net0)
#net0 = Dense(50, activation='relu')(net0)
net = Dense(70, activation='relu')(net0)
output = Dense(n_classes, activation='sigmoid')(net)
model = Model(inputs=inputs, outputs=output)

return model


def get_discriminator(params, n_classes, dropout_rate=0.5):
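# Discriminator: scores a (predicted label vector, raw sequence) pair. The sequence
# branch is embedded and reduced with a kernel-size-1 Conv1D, then concatenated with
# the label branch before the final dense stack.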
inputs = Input(shape=(n_classes, ))
inputs2 = Input(shape=(MAXLEN,), dtype='int32', name='d_input2')
x2 = Embedding(8001, 128, input_length=MAXLEN)(inputs2)
x2 = Conv1D(filters=1, kernel_size=1, padding='valid', activation='relu', strides=1)(x2)
x2 = Lambda(lambda x: K.squeeze(x, 2))(x2)

#for i in range(params['nb_conv']):
# x2 = Conv1D ( activation="relu", padding="valid", strides=1, filters=params['nb_filter'],kernel_size=params['filter_length'])(x2)

#x2 =MaxPooling1D(strides=params['stride'], pool_size=params['pool_length'])(x2)
#x2 = Flatten()(x2)
size = 40
x = inputs
x = Dropout(dropout_rate)(x)
x = Dense(size)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)


size = 40
x2 = Dropout(dropout_rate)(x2)
x2 = Dense(size)(x2)
x2 = BatchNormalization()(x2)
x2 = Activation('relu')(x2)


x = Concatenate(axis=1, name='merged2')([x, x2])
layer_sizes = [80, 40, 30]
for size in layer_sizes:
x = Dropout(dropout_rate)(x)
x = Dense(size)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)


outputs = Dense(1)(x)
model = Model(inputs=[inputs, inputs2], outputs=outputs, name='Discriminator')

return model



def get_model(params, nb_classes, batch_size, GRADIENT_PENALTY_WEIGHT=10):
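# Wraps the generator and discriminator into paired trainable models using the
# WGAN-GP wrapper imported from conditional_wgan_wrapper.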
generator = get_generator(params, nb_classes)
discriminator = get_discriminator(params, nb_classes)

generator_model, discriminator_model = \
WGAN_wrapper(generator=generator,
discriminator=discriminator,
generator_input_shape=(MAXLEN,),
discriminator_input_shape=(nb_classes,),
discriminator_input_shape2=(MAXLEN,),
batch_size=batch_size,
gradient_penalty_weight=GRADIENT_PENALTY_WEIGHT)

logging.info('Compilation finished')
return generator_model, discriminator_model


def train_wgan(generator_model, discriminator_model, batch_size, epochs,
x_train, y_train, x_val, y_val, generator_model_path, discriminator_model_path,
TRAINING_RATIO=10, N_WARM_UP=0):
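# Alternating WGAN training loop: during the first N_WARM_UP epochs only the generator
# (reconstruction loss) is updated; afterwards the discriminator is updated
# TRAINING_RATIO times per generator update, as in standard WGAN-GP training.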
BATCH_SIZE = batch_size
N_EPOCH = epochs
positive_y = np.ones((batch_size, 1), dtype=np.float32)
zero_y = positive_y * 0
negative_y = -positive_y
positive_full_y = np.ones((BATCH_SIZE * TRAINING_RATIO, 1), dtype=np.float32)
dummy_y = np.zeros((BATCH_SIZE, 1), dtype=np.float32)
positive_full_enable_train = np.ones((len(x_train), 1), dtype=np.float32)
positive_full_enable_val = np.ones((len(x_val), 1), dtype=np.float32)
#positive_enable_train = np.ones((1, batch_size),dtype = np.float32 )
#positive_full_train_enable = np.ones((1,BATCH_SIZE * TRAINING_RATIO ), dtype=np.float32 )
best_validation_loss = None

for epoch in range(N_EPOCH):
# np.random.shuffle(X_train)
print("Epoch: ", epoch)
print("Number of batches: ", int(y_train.shape[0] // BATCH_SIZE))
discriminator_loss = []
generator_loss = []
minibatches_size = BATCH_SIZE * TRAINING_RATIO

shuffled_indexes = np.random.permutation(x_train.shape[0])
shuffled_indexes_2 = np.random.permutation(x_train.shape[0])

for i in range(int(y_train.shape[0] // (BATCH_SIZE * TRAINING_RATIO))):
batch_indexes = shuffled_indexes[i * minibatches_size:(i + 1) * minibatches_size]
batch_indexes_2 = shuffled_indexes_2[i * minibatches_size:(i + 1) * minibatches_size]
x = x_train[batch_indexes]
y = y_train[batch_indexes]
y_2 = y_train[batch_indexes_2]
x_2 = x_train[batch_indexes_2]
if epoch < N_WARM_UP:
for j in range(TRAINING_RATIO):
x_batch = x[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
y_batch = y[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]

generator_loss.append(generator_model.train_on_batch([x_batch, positive_y], [y_batch, zero_y]))
else:
for j in range(TRAINING_RATIO):
x_batch = x[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
y_batch_2 = y_2[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
x_batch_2 = x_2[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
# noise = np.random.rand(BATCH_SIZE, 100).astype(np.float32)
noise = x_batch
#print(sum(y_batch_2))
discriminator_loss.append(discriminator_model.train_on_batch(
[y_batch_2, noise, x_batch_2],
[positive_y, negative_y, dummy_y]))

generator_loss.append(generator_model.train_on_batch([x, positive_full_y], [y, positive_full_y]))
# Still needs some code to display losses from the generator and discriminator, progress bars, etc.
predicted_y_train, _ = generator_model.predict([x_train, positive_full_enable_train], batch_size=BATCH_SIZE)
predicted_y_val, _ = generator_model.predict([x_val, positive_full_enable_val], batch_size=BATCH_SIZE)
#print(sum(sum(positive_full_enable_train)))
#print(predicted_y_train)
train_loss = log_loss(y_train, predicted_y_train)
val_loss = log_loss(y_val, predicted_y_val)

print("train loss: {:.4f}, validation loss: {:.4f}, discriminator loss: {:.4f}".format(
train_loss, val_loss,
(np.sum(np.asarray(discriminator_loss)) if discriminator_loss else -1) / x_train.shape[0]))

if best_validation_loss is None or best_validation_loss > val_loss:
print('\nEpoch %05d: validation loss improved to %0.5f,'
' saving model to %s and %s'
% (epoch + 1, val_loss, generator_model_path, discriminator_model_path))

best_validation_loss = val_loss
generator_model.save(generator_model_path, overwrite=True)
discriminator_model.save(discriminator_model_path, overwrite=True)


def model(params, batch_size=20, nb_epoch=40, is_train=True):
# set parameters:
#nb_classes = len(functions)
start_time = time.time()
logging.info("Loading Data")
##
#train, val, test, train_df, valid_df, test_df = load_data()
#train_df = pd.concat([train_df, valid_df])
#test_gos = test_df['gos'].values
#train_data, train_labels = train
#val_data, val_labels = val
#test_data, test_labels = test
##

train_data, train_labels, test_data, test_labels, val_data, val_labels = load_data2()
nb_classes = train_labels.shape[1]

logging.info("Data loaded in %d sec" % (time.time() - start_time))
logging.info("Training data size: %d" % len(train_data))
logging.info("Validation data size: %d" % len(val_data))
logging.info("Test data size: %d" % len(test_data))
generator_model_path = DATA_ROOT + 'models/new_model_seq_' + FUNCTION + '.h5'
discriminator_model_path = DATA_ROOT + 'models/new_model_disc_seq_' + FUNCTION + '.h5'


logging.info('Starting training the model')

train_generator = DataGenerator(batch_size, nb_classes)
train_generator.fit(train_data, train_labels)
valid_generator = DataGenerator(batch_size, nb_classes)
valid_generator.fit(val_data, val_labels)
test_generator = DataGenerator(batch_size, nb_classes)
test_generator.fit(test_data, test_labels)

if is_train:
generator_model, discriminator_model = get_model(params, nb_classes, batch_size)
train_wgan(generator_model, discriminator_model, batch_size=batch_size, epochs=nb_epoch,
x_train=train_data, y_train=train_labels, x_val=val_data, y_val=val_labels,
generator_model_path=generator_model_path,
discriminator_model_path=discriminator_model_path)

logging.info('Loading best model')
model = load_model(generator_model_path,
custom_objects={'generator_recunstruction_loss_new': generator_recunstruction_loss_new,
'wasserstein_loss': wasserstein_loss})

logging.info('Predicting')
preds = model.predict_generator(test_generator, steps=int(np.ceil(len(test_data) / batch_size)))[0]
# incon = 0
# for i in xrange(len(test_data)):
# for j in xrange(len(functions)):
# childs = set(go[functions[j]]['children']).intersection(func_set)
# ok = True
# for n_id in childs:
# if preds[i, j] < preds[i, go_indexes[n_id]]:
# preds[i, j] = preds[i, go_indexes[n_id]]
# ok = False
# if not ok:
# incon += 1
logging.info('Computing performance')
f, p, r, t, preds_max = compute_performance(preds, test_labels) #, test_gos)
roc_auc = compute_roc(preds, test_labels)
mcc = compute_mcc(preds_max, test_labels)
aupr, _ = compute_aupr(preds, test_labels)
m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max = micro_macro_function_centric_f1(preds.T, test_labels.T)

logging.info('Protein centric macro Th, PR, RC, F1: \t %f %f %f %f' % (t, p, r, f))
logging.info('ROC AUC: \t %f ' % (roc_auc, ))
logging.info('MCC: \t %f ' % (mcc, ))
logging.info('AUPR: \t %f ' % (aupr, ))
logging.info('Function centric macro PR, RC, F1: \t %f %f %f' % (M_pr_max, M_rc_max, M_f1_max))
logging.info('Function centric micro PR, RC, F1: \t %f %f %f' % (m_pr_max, m_rc_max, m_f1_max))

function_centric_performance(functions, preds.T, test_labels.T, train_labels.T)



def function_centric_performance(functions, preds, labels, labels_train):
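# Per-GO-term evaluation: for each term, sweep thresholds 0.01-0.99, keep the best F1
# (with its precision/recall), and record the number of annotated training proteins
# and the term's height in the GO hierarchy.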
results = []
preds = np.round(preds, 2)
for i in range(preds.shape[0]):
f_max = 0
p_max = 0
r_max = 0
for t in range(1, 100):
threshold = t / 100.0
predictions = (preds[i, :] > threshold).astype(np.int32)
tp = np.sum(predictions * labels[i, :])
fp = np.sum(predictions) - tp
fn = np.sum(labels[i, :]) - tp
if tp > 0:
precision = tp / (1.0 * (tp + fp))
recall = tp / (1.0 * (tp + fn))
f = 2 * precision * recall / (precision + recall)
else:
if fp == 0 and fn == 0:
precision = 1
recall = 1
f = 1
else:
precision = 0
recall = 0
f = 0

if f_max < f:
f_max = f
p_max = precision
r_max = recall
num_prots_train = np.sum(labels_train[i, :])
height = get_height(go, functions[i])
results.append([functions[i], num_prots_train, height, f_max, p_max, r_max])
results = pd.DataFrame(results)
results.to_csv('Con_GodGanSeq_results_' + FUNCTION + '.txt', sep='\t', index=False)



def function_centric_performance_backup(functions, preds, labels, labels_train):
results = []
preds = np.round(preds, 2)
for i in range(len(functions)):
f_max = 0
p_max = 0
r_max = 0
x = list()
y = list()
total = 0
for t in range(1, 100):
threshold = t / 100.0
predictions = (preds[i, :] > threshold).astype(np.int32)
tp = np.sum(predictions * labels[i, :])
fp = np.sum(predictions) - tp
fn = np.sum(labels[i, :]) - tp
if tp > 0:
sn = tp / (1.0 * np.sum(labels[i, :]))
sp = np.sum((predictions ^ 1) * (labels[i, :] ^ 1))
sp /= 1.0 * np.sum(labels[i, :] ^ 1)
fpr = 1 - sp
x.append(fpr)
y.append(sn)
precision = tp / (1.0 * (tp + fp))
recall = tp / (1.0 * (tp + fn))
f = 2 * precision * recall / (precision + recall)
total += 1
if f_max < f:
f_max = f
p_max = precision
r_max = recall
num_prots = np.sum(labels[i, :])
num_prots_train = np.sum(labels_train[i, :])
if total > 1:
roc_auc = auc(x, y)
else:
roc_auc = 0
height = get_height(go, functions[i])
results.append([functions[i], f_max, p_max, r_max, num_prots, num_prots_train, height, roc_auc])
results = pd.DataFrame(results)
#results.to_csv('new_results.txt', sep='\t', index=False)
results.to_csv('Con_GodGanSeq_results_' + FUNCTION + '.txt', sep='\t', index=False)
#results = np.array(results)
#p_mean = (np.sum(results[:,2])) / len(functions)
#r_mean = (np.sum(results[:,3])) / len(functions)
#f_mean = (2*p_mean*r_mean)/(p_mean+r_mean)
#roc_auc_mean = (np.sum(results[:,7])) / len(functions)
#print('Function centric performance (macro) ' '%f %f %f %f' % (f_mean, p_mean, r_mean, roc_auc_mean))



def micro_macro_function_centric_f1_backup(preds, labels):
preds = np.round(preds, 2)
m_f1_max = 0
M_f1_max = 0
for t in range(1, 100):
threshold = t / 100.0
predictions = (preds > threshold).astype(np.int32)
m_tp = 0
m_fp = 0
m_fn = 0
M_pr = 0
M_rc = 0
total = 0
p_total = 0
for i in range(len(preds)):
tp = np.sum(predictions[i, :] * labels[i, :])
fp = np.sum(predictions[i, :]) - tp
fn = np.sum(labels[i, :]) - tp
if tp == 0 and fp == 0 and fn == 0:
continue
total += 1
if tp > 0:
pr = tp / (1.0 * (tp + fp))
rc = tp / (1.0 * (tp + fn))
m_tp += tp
m_fp += fp
m_fn += fn
M_pr += pr
M_rc += rc
p_total += 1

if p_total == 0:
continue
if total > 0:
m_tp /= total
m_fn /= total
m_fp /= total
m_pr = m_tp / (1.0 * (m_tp + m_fp))
m_rc = m_tp / (1.0 * (m_tp + m_fn))
M_pr /= p_total
M_rc /= total
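# NOTE: M_rc is averaged over `total` while M_pr is averaged over `p_total`; this
# asymmetry looks unintentional but is preserved in this backup implementation.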
m_f1 = 2 * m_pr * m_rc / (m_pr + m_rc)
M_f1 = 2 * M_pr * M_rc / (M_pr + M_rc)

if m_f1 > m_f1_max:
m_f1_max = m_f1
m_pr_max = m_pr
m_rc_max = m_rc

if M_f1 > M_f1_max:
M_f1_max = M_f1
M_pr_max = M_pr
M_rc_max = M_rc

return m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max





def micro_macro_function_centric_f1(preds, labels):
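# Function-centric micro/macro F1: sweep thresholds t/200 for t in 1..199 and return
# the precision/recall/F1 at the thresholds maximizing micro-F1 and macro-F1.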
preds = np.round(preds, 2)
m_f1_max = 0
M_f1_max = 0
# initialize best precision/recall so the return does not fail if no threshold improves F1
m_pr_max = m_rc_max = 0
M_pr_max = M_rc_max = 0
for t in range(1, 200):
threshold = t / 200.0
predictions = (preds > threshold).astype(np.int32)
m_tp = 0
m_fp = 0
m_fn = 0
M_pr = 0
M_rc = 0
for i in range(preds.shape[0]):
tp = np.sum(predictions[i, :] * labels[i, :])
fp = np.sum(predictions[i, :]) - tp
fn = np.sum(labels[i, :]) - tp
m_tp += tp
m_fp += fp
m_fn += fn
if tp > 0:
pr = 1.0 * tp / (1.0 * (tp + fp))
rc = 1.0 * tp / (1.0 * (tp + fn))
else:
if fp == 0 and fn == 0:
pr = 1
rc = 1
else:
pr = 0
rc = 0
M_pr += pr
M_rc += rc

if m_tp > 0:
m_pr = 1.0 * m_tp / (1.0 * (m_tp + m_fp))
m_rc = 1.0 * m_tp / (1.0 * (m_tp + m_fn))
m_f1 = 2.0 * m_pr * m_rc / (m_pr + m_rc)
else:
if m_fp == 0 and m_fn == 0:
m_pr = 1
m_rc = 1
m_f1 = 1
else:
m_pr = 0
m_rc = 0
m_f1 = 0

M_pr /= preds.shape[0]
M_rc /= preds.shape[0]
if M_pr == 0 and M_rc == 0:
M_f1 = 0
else:
M_f1 = 2.0 * M_pr * M_rc / (M_pr + M_rc)
if m_f1 > m_f1_max:
m_f1_max = m_f1
m_pr_max = m_pr
m_rc_max = m_rc

if M_f1 > M_f1_max:
M_f1_max = M_f1
M_pr_max = M_pr
M_rc_max = M_rc

return m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max


def compute_roc(preds, labels):
# micro-averaged ROC AUC over all flattened (protein, term) predictions
fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten())
roc_auc = auc(fpr, tpr)
return roc_auc


def compute_aupr(preds, labels):
# micro-averaged area under the precision-recall curve over all flattened predictions
pr, rc, threshold = precision_recall_curve(labels.flatten(), preds.flatten())
pr_auc = auc(rc, pr)
#pr, rc, threshold =precision_recall_curve(labels.flatten(), preds.flatten(),average ='macro' )
M_pr_auc = 0
return pr_auc, M_pr_auc



def compute_mcc(preds, labels):
# Matthews correlation coefficient over the flattened binary predictions
mcc = matthews_corrcoef(labels.flatten(), preds.flatten())
return mcc


def compute_performance(preds, labels): #, gos):
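# Protein-centric Fmax: sweep thresholds 0.01-0.99; precision is averaged over proteins
# with at least one true positive, recall over all proteins with any annotation, and the
# best resulting F1 is returned with its threshold and binarized predictions.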
preds = np.round(preds, 2)
f_max = 0
p_max = 0
r_max = 0
t_max = 0
for t in range(1, 100):
threshold = t / 100.0
predictions = (preds > threshold).astype(np.int32)
total = 0
f = 0.0
p = 0.0
r = 0.0
p_total = 0
for i in range(labels.shape[0]):
tp = np.sum(predictions[i, :] * labels[i, :])
fp = np.sum(predictions[i, :]) - tp
fn = np.sum(labels[i, :]) - tp
all_gos = set()
#for go_id in gos[i]:
# if go_id in all_functions:
# all_gos |= get_anchestors(go, go_id)
#all_gos.discard(GO_ID)
#all_gos -= func_set
#fn += len(all_gos)
if tp == 0 and fp == 0 and fn == 0:
continue
total += 1
if tp != 0:
p_total += 1
precision = tp / (1.0 * (tp + fp))
recall = tp / (1.0 * (tp + fn))
p += precision
r += recall
if p_total == 0:
continue
r /= total
p /= p_total
if p + r > 0:
f = 2 * p * r / (p + r)
if f_max < f:
f_max = f
p_max = p
r_max = r
t_max = threshold
predictions_max = predictions
return f_max, p_max, r_max, t_max, predictions_max


def get_gos(pred):
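# NOTE: relies on a module-level `labels_gos` sequence of (labels, gos) pairs that is
# not defined anywhere in this file.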
mdist = 1.0
mgos = None
for i in range(len(labels_gos)):
labels, gos = labels_gos[i]
dist = distance.cosine(pred, labels)
if mdist > dist:
mdist = dist
mgos = gos
return mgos


if __name__ == '__main__':
main()
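
# Example invocation (assumes go.obo, data/swiss/<function>.pkl and the data2/ CSVs are in place):
#   python conditional_new_nn_hirearchical_seq_mt.py --function bp --device gpu:0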
