4 years ago · 4e5b5735ce
--- a/conditional_new_nn_hirearchical_seq_mt.py
+++ b/conditional_new_nn_hirearchical_seq_mt.py
@@ -1,917 +0,0 @@
 #!/usr/bin/env python

 """

 """
 from __future__ import division

 import logging
 import sys
 import time
 from collections import deque
 from multiprocessing import Pool

 import click as ck
 import numpy as np
 import pandas as pd
 import tensorflow as tf
 from keras import backend as K
 from keras.callbacks import EarlyStopping, ModelCheckpoint
 from keras.layers import (
    Dense, Input, SpatialDropout1D, Conv1D, MaxPooling1D,
    Flatten, Concatenate, Add, Maximum, Embedding, BatchNormalization, Activation, Dropout)
 from keras.losses import binary_crossentropy
 from keras.models import Sequential, Model, load_model
 from keras.preprocessing import sequence
 from scipy.spatial import distance
 from sklearn.metrics import log_loss
 from sklearn.metrics import roc_curve, auc, matthews_corrcoef
 from keras.layers import Lambda
 from sklearn.metrics import precision_recall_curve

 from utils import (
    get_gene_ontology,
    get_go_set,
    get_anchestors,
    get_parents,
    DataGenerator,
    FUNC_DICT,
    get_height,
    get_ipro)
 from conditional_wgan_wrapper_post import WGAN_wrapper, wasserstein_loss, generator_recunstruction_loss_new

 # Create a TensorFlow session with GPU memory growth enabled and register it
 # as the session used by the Keras backend.
 config = tf.ConfigProto()
 config.gpu_options.allow_growth = True
 sess = tf.Session(config=config)
 K.set_session(sess)

 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
 sys.setrecursionlimit(100000)

 # Directory prefix for the pickled data frames, saved models and reports.
 DATA_ROOT = 'data/swiss/'
 # Length of the input vectors fed to the generator and discriminator.
 MAXLEN = 258 #1000
 # NOTE(review): REPLEN and ind are not read anywhere in the visible code.
 REPLEN = 256
 ind = 0


@ck.command()
@ck.option(
    '--function',
    default='bp',
    help='Ontology id (mf, bp, cc)')
@ck.option(
    '--device',
    default='gpu:0',
    help='GPU or CPU device id')
@ck.option(
    '--org',
    default= None,
    help='Organism id for filtering test set')
# NOTE(review): this is a boolean flag whose default is already True, so
# passing --train presumably cannot switch training on from an "off" state;
# confirm against click's boolean-flag semantics.
@ck.option('--train',default = True, is_flag=True)
@ck.option('--param', default=0, help='Param index 0-7')
 def main(function, device, org, train, param):
    # Command-line entry point. Publishes the run configuration as module
    # globals (FUNCTION, GO_ID, go, ORG, functions, func_set, all_functions,
    # go_indexes, node_names) and then calls model() on the chosen device.
    # `param` is only referenced by the commented-out parameter sweep below.
    # (Documented with comments rather than a docstring so the click help
    # text stays unchanged.)
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)
    # Map every function id to its position in `functions`.
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    # Registry of node names handed out by get_node_name(unique=True).
    global node_names
    node_names = set()
    with tf.device('/' + device):
        # Hyper-parameters passed to model(); which of them are actually
        # consumed depends on the builders called from there.
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 1,
            'nb_dense': 1,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32        
        }
        model(params, is_train=train)
        #dims = [64, 128, 256, 512]
        #nb_filters = [16, 32, 64, 128]
        #nb_convs = [1, 2, 3, 4]
        #nb_dense = [1, 2, 3, 4]
        #for i in range(param * 32, param * 32 + 32):
         #   dim = i % 4
         #  i = i / 4
         #   nb_fil = i % 4
         #   i /= 4
         #   conv = i % 4
         #   i /= 4
         #   den = i
         #   params['embedding_dims'] = dims[dim]
         #   params['nb_filter'] = nb_filters[nb_fil]
         #   params['nb_conv'] = nb_convs[conv]
         #   params['nb_dense'] = nb_dense[den]

    # performanc_by_interpro()


 def load_data2():
    """Load the tab-separated feature/label tables and split them 80/20.

    Reads ``data2/all_data_X.csv`` (features), ``data2/all_data_Y.csv``
    (labels) and ``data2/<FUNCTION>_branches.txt``. Only the label columns
    whose names appear in the index of the branches file are kept, in the
    order given by that file.

    The rows are split with a random permutation: the test set holds
    ``ceil(0.2 * n)`` distinct rows and the training set holds all remaining
    rows. The previous implementation drew test indices as
    ``ceil(rand * n)``, which could produce the out-of-range index ``n``
    and sampled with replacement, so test rows could repeat.

    The validation set is the test set itself (no separate hold-out).

    Returns:
        Tuple ``(train_data, train_labels, test_data, test_labels,
        val_data, val_labels)`` of NumPy arrays.
    """
    all_data_x_fn = 'data2/all_data_X.csv'
    all_data_x = pd.read_csv(all_data_x_fn, sep='\t', header=0, index_col=0)
    # Strip the double quotes that may surround the identifiers in the index.
    all_data_x.index = [p.replace('"', '') for p in all_data_x.index]
    all_data_y_fn = 'data2/all_data_Y.csv'
    all_data_y = pd.read_csv(all_data_y_fn, sep='\t', header=0, index_col=0)
    branch = pd.read_csv('data2/' + FUNCTION + '_branches.txt', sep='\t', header=0, index_col=0)
    all_x = all_data_x.values
    # Build the lookup set once instead of re-creating a list per candidate.
    label_columns = set(all_data_y.columns)
    branches = [p for p in branch.index.tolist() if p in label_columns]
    all_y = pd.DataFrame(all_data_y, columns=branches).values

    n_samples = len(all_x)
    number_of_test = int(np.ceil(0.2 * n_samples))
    # A permutation yields valid, non-repeating indices for both subsets.
    shuffled = np.random.permutation(n_samples)
    index_test = shuffled[:number_of_test]
    # Keep the training rows in their original file order, as before.
    index_train = np.sort(shuffled[number_of_test:])

    train_data = all_x[index_train, :]
    test_data = all_x[index_test, :]
    train_labels = all_y[index_train, :]
    test_labels = all_y[index_test, :]
    val_data = test_data
    val_labels = test_labels
    print(train_labels.shape)
    print(test_labels.shape)
    return train_data, train_labels, test_data, test_labels, val_data, val_labels




 def load_data():
    """Load the pickled train/test frames and turn them into arrays.

    The first 80% of the training frame (in index order) is used for
    training and the remainder for validation. When the global ``ORG`` is
    set, the test frame is restricted to rows whose ``orgs`` equals it.

    Returns:
        ``(train, valid, test, train_df, valid_df, test_df)`` where the first
        three items are ``(data, labels)`` tuples built from the ``ngrams``
        and ``labels`` columns.
    """
    full_train_df = pd.read_pickle(DATA_ROOT + 'train' + '-' + FUNCTION + '.pkl')
    row_ids = full_train_df.index.values
    split_at = int(len(full_train_df) * 0.8)
    train_df = full_train_df.loc[row_ids[:split_at]]
    valid_df = full_train_df.loc[row_ids[split_at:]]

    test_df = pd.read_pickle(DATA_ROOT + 'test' + '-' + FUNCTION + '.pkl')
    print(test_df['orgs'])
    if ORG is not None:
        logging.info('Unfiltered test size: %d' % len(test_df))
        test_df = test_df[test_df['orgs'] == ORG]
        logging.info('Filtered test size: %d' % len(test_df))

    def to_matrix(column):
        # Stack a sequence of equally long vectors into a 2-D array.
        stacked = np.hstack(column)
        return stacked.reshape(len(column), len(column[0]))

    def extract(frame):
        # Build the (data, labels) pair for one data frame.
        print(frame['labels'].values.shape)
        label_matrix = to_matrix(frame['labels'].values)
        padded = sequence.pad_sequences(frame['ngrams'].values, maxlen=MAXLEN)
        ngram_matrix = to_matrix(padded)
        # The embeddings column is still reshaped, although the result is
        # not part of the returned data.
        to_matrix(frame['embeddings'].values)
        return ngram_matrix, label_matrix

    train, valid, test = [extract(frame) for frame in (train_df, valid_df, test_df)]
    return train, valid, test, train_df, valid_df, test_df


 def get_feature_model(params):
    """Build the sequential embedding + convolution feature extractor.

    Layers, in order: Embedding (vocabulary 8001), SpatialDropout1D(0.4),
    ``params['nb_conv']`` ReLU Conv1D layers, one MaxPooling1D and Flatten.
    """
    vocabulary_size = 8001
    feature_model = Sequential()
    feature_model.add(
        Embedding(vocabulary_size, params['embedding_dims'], input_length=MAXLEN))
    feature_model.add(SpatialDropout1D(0.4))
    for _ in range(params['nb_conv']):
        conv_layer = Conv1D(
            filters=params['nb_filter'],
            kernel_size=params['filter_length'],
            strides=1,
            padding="valid",
            activation="relu")
        feature_model.add(conv_layer)
    pooling = MaxPooling1D(pool_size=params['pool_length'], strides=params['stride'])
    feature_model.add(pooling)
    feature_model.add(Flatten())
    return feature_model


 def merge_outputs(outputs, name):
    """Concatenate ``outputs`` along axis 1; a single output is returned as is."""
    if len(outputs) != 1:
        return Concatenate(axis=1, name=name)(outputs)
    return outputs[0]


 def merge_nets(nets, name):
    """Sum ``nets`` element-wise with an Add layer; a single net is returned as is."""
    if len(nets) != 1:
        return Add(name=name)(nets)
    return nets[0]

 def get_node_name(go_id, unique=False):
    """Return the part of ``go_id`` after the first colon as a node name.

    With ``unique=True`` the name is registered in the global ``node_names``
    set; if it is already taken, ``_1``, ``_2``, ... is appended until an
    unused name is found.
    """
    base = go_id.split(':')[1]
    if not unique:
        return base
    candidate = base
    suffix = 0
    while candidate in node_names:
        suffix += 1
        candidate = base + '_' + str(suffix)
    node_names.add(candidate)
    return candidate


 def get_function_node(name, inputs):
    """Attach a single-unit sigmoid Dense layer named ``<name>_out`` to ``inputs``.

    The same tensor is returned twice, so callers can unpack it as a
    ``(net, output)`` pair.
    """
    sigmoid_out = Dense(1, name=name + '_out', activation='sigmoid')(inputs)
    return sigmoid_out, sigmoid_out


 def get_generator(params, n_classes):
    """Build the generator: a dense network from the input vector to scores.

    The ``MAXLEN``-wide float input passes through ReLU Dense layers of 150,
    150 and 70 units, followed by a sigmoid Dense layer with ``n_classes``
    units. ``params`` is accepted but not used by this builder.
    """
    inputs = Input(shape=(MAXLEN,), dtype='float32', name='input1')
    hidden = inputs
    for units in (150, 150, 70):
        hidden = Dense(units, activation='relu')(hidden)
    scores = Dense(n_classes, activation='sigmoid')(hidden)
    return Model(inputs=inputs, outputs=scores)


 def get_discriminator(params, n_classes, dropout_rate=0.5):
    """Build the two-input discriminator (critic).

    The first input is a label vector of width ``n_classes``. The second is
    an integer vector of width ``MAXLEN`` that is embedded, reduced to one
    channel by a 1x1 convolution and squeezed back to two dimensions. Each
    branch goes through a Dropout/Dense(40)/BatchNorm/ReLU block; the two
    branches are concatenated and passed through three more such blocks
    (80, 40, 30 units) before a linear single-unit output.
    ``params`` is accepted but not used by this builder.
    """
    def regularized_dense(tensor, units):
        # Dropout -> Dense -> BatchNormalization -> ReLU.
        tensor = Dropout(dropout_rate)(tensor)
        tensor = Dense(units)(tensor)
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    label_input = Input(shape=(n_classes, ))
    sequence_input = Input(shape=(MAXLEN,), dtype='int32', name='d_input2')

    embedded = Embedding(8001, 128, input_length=MAXLEN)(sequence_input)
    single_channel = Conv1D(
        filters=1, kernel_size=1, padding='valid', activation='relu', strides=1)(embedded)
    # Drop the trailing channel axis of size one.
    sequence_features = Lambda(lambda x: K.squeeze(x, 2))(single_channel)

    label_branch = regularized_dense(label_input, 40)
    sequence_branch = regularized_dense(sequence_features, 40)

    merged = Concatenate(axis=1, name='merged2')([label_branch, sequence_branch])
    for units in (80, 40, 30):
        merged = regularized_dense(merged, units)

    critic_score = Dense(1)(merged)
    return Model(inputs=[label_input, sequence_input], outputs=critic_score,
                 name='Discriminator')



 def get_model(params,nb_classes, batch_size, GRADIENT_PENALTY_WEIGHT=10):
    """Create the generator and discriminator and wrap them with WGAN_wrapper.

    Returns:
        ``(generator_model, discriminator_model)`` as produced by
        ``WGAN_wrapper``.
    """
    wrapper_kwargs = dict(
        generator=get_generator(params, nb_classes),
        discriminator=get_discriminator(params, nb_classes),
        generator_input_shape=(MAXLEN,),
        discriminator_input_shape=(nb_classes,),
        discriminator_input_shape2=(MAXLEN, ),
        batch_size=batch_size,
        gradient_penalty_weight=GRADIENT_PENALTY_WEIGHT)
    generator_model, discriminator_model = WGAN_wrapper(**wrapper_kwargs)

    logging.info('Compilation finished')
    return generator_model, discriminator_model


 def train_wgan(generator_model, discriminator_model, batch_size, epochs,
               x_train, y_train, x_val, y_val, generator_model_path, discriminator_model_path,
               TRAINING_RATIO=10, N_WARM_UP=0):
    """Alternate discriminator and generator updates and keep the best models.

    Each epoch draws two independent permutations of the training rows and
    walks over them in chunks of ``batch_size * TRAINING_RATIO`` rows; rows
    that do not fill a complete chunk are skipped for that epoch.

    * During the first ``N_WARM_UP`` epochs only the generator is trained,
      one ``batch_size`` slice at a time.
    * Afterwards every chunk triggers ``TRAINING_RATIO`` discriminator
      updates followed by a single generator update on the whole chunk.

    After every epoch the generator predicts the training and validation
    sets, the log-losses are printed, and both models are saved whenever the
    validation loss improves on the best value seen so far.

    Args:
        generator_model, discriminator_model: compiled models; the exact
            input/output layout is defined by the WGAN wrapper that built
            them (not visible here).
        batch_size: number of rows per discriminator update.
        epochs: number of passes over the training data.
        x_train, y_train, x_val, y_val: arrays indexed by row.
        generator_model_path, discriminator_model_path: save locations.
        TRAINING_RATIO: discriminator updates per generator update.
        N_WARM_UP: number of generator-only epochs.
    """
    BATCH_SIZE = batch_size
    N_EPOCH = epochs


    # Constant target/auxiliary vectors passed to train_on_batch below.
    positive_y = np.ones((batch_size, 1), dtype=np.float32)
    zero_y = positive_y * 0
    negative_y = -positive_y
    positive_full_y = np.ones((BATCH_SIZE * TRAINING_RATIO, 1), dtype=np.float32)
    dummy_y = np.zeros((BATCH_SIZE, 1), dtype=np.float32)

    # All-ones second input used when predicting on the full train/val sets.
    positive_full_enable_train = np.ones((len(x_train), 1), dtype = np.float32 )
    positive_full_enable_val = np.ones((len(x_val), 1), dtype =np.float32 )
    #positive_enable_train = np.ones((1, batch_size),dtype = np.float32 )
    #positive_full_train_enable = np.ones((1,BATCH_SIZE * TRAINING_RATIO ), dtype=np.float32 )
    best_validation_loss = None

    for epoch in range(N_EPOCH):
        # np.random.shuffle(X_train)
        print("Epoch: ", epoch)
        print("Number of batches: ", int(y_train.shape[0] // BATCH_SIZE))
        discriminator_loss = []
        generator_loss = []
        minibatches_size = BATCH_SIZE * TRAINING_RATIO

        # Two independent orderings: one for the generator inputs, one for
        # the (label, input) pairs shown to the discriminator.
        shuffled_indexes = np.random.permutation(x_train.shape[0])
        shuffled_indexes_2 = np.random.permutation(x_train.shape[0])

        for i in range(int(y_train.shape[0] // (BATCH_SIZE * TRAINING_RATIO))):
            batch_indexes = shuffled_indexes[i * minibatches_size:(i + 1) * minibatches_size]
            batch_indexes_2 = shuffled_indexes_2[i * minibatches_size:(i + 1) * minibatches_size]
            x = x_train[batch_indexes]
            y = y_train[batch_indexes]
            y_2 = y_train[batch_indexes_2]
            x_2 = x_train[batch_indexes_2]
            if epoch < N_WARM_UP:
                # Warm-up: generator-only updates on batch_size slices.
                for j in range(TRAINING_RATIO):
                    x_batch = x[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
                    y_batch = y[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]

                    generator_loss.append(generator_model.train_on_batch([x_batch, positive_y], [y_batch, zero_y]))
            else:
                for j in range(TRAINING_RATIO):
                    x_batch = x[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
                    y_batch_2 = y_2[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
                    x_batch_2 = x_2[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
                    # noise = np.random.rand(BATCH_SIZE, 100).astype(np.float32)
                    # The generator input takes the place of random noise.
                    noise = x_batch
                    #print(sum(y_batch_2))
                    discriminator_loss.append(discriminator_model.train_on_batch(
                        [y_batch_2,  noise, x_batch_2 ],
                        [positive_y, negative_y, dummy_y]))

                # One generator update on the whole chunk.
                generator_loss.append(generator_model.train_on_batch([x,positive_full_y], [y, positive_full_y]))
        # Still needs some code to display losses from the generator and discriminator, progress bars, etc.
        predicted_y_train, _ = generator_model.predict([x_train , positive_full_enable_train], batch_size=BATCH_SIZE)
        predicted_y_val, _ = generator_model.predict([ x_val , positive_full_enable_val ], batch_size=BATCH_SIZE)

        #print(sum(sum(positive_full_enable_train)))
        #print(predicted_y_train)
        train_loss = log_loss(y_train, predicted_y_train)
        val_loss = log_loss(y_val, predicted_y_val)

        # The reported discriminator loss is the sum of everything returned by
        # train_on_batch this epoch divided by the number of training rows
        # (or -1 divided by that number when no discriminator step ran).
        print("train loss: {:.4f}, validation loss: {:.4f}, discriminator loss: {:.4f}".format(
            train_loss, val_loss,
            (np.sum(np.asarray(discriminator_loss)) if discriminator_loss else -1) / x_train.shape[0]))

        if best_validation_loss is None or best_validation_loss > val_loss:
            # NOTE(review): the value printed after "improved from" is the
            # new val_loss, not the previous best - confirm intended wording.
            print('\nEpoch %05d: improved from %0.5f,'
                  ' saving model to %s and %s'
                  % (epoch + 1, val_loss, generator_model_path, discriminator_model_path))

            best_validation_loss = val_loss
            generator_model.save(generator_model_path, overwrite=True)
            discriminator_model.save(discriminator_model_path, overwrite=True)


 def model(params, batch_size=20, nb_epoch=40, is_train=True):
    """Load the data, optionally train the WGAN, then evaluate the generator.

    Steps: load the splits with ``load_data2``, wrap them in
    ``DataGenerator`` objects, train with ``train_wgan`` when ``is_train`` is
    true, reload the saved generator from disk, predict on the test set and
    log protein-centric and function-centric metrics. Per-function results
    are written by ``function_centric_performance``.

    Args:
        params: hyper-parameter dict forwarded to ``get_model``.
        batch_size: batch size for training and prediction.
        nb_epoch: number of training epochs.
        is_train: when false, training is skipped and the model file that
            already exists at the generator path is evaluated.
    """
    # set parameters:
    #nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    
    ##
    #train, val, test, train_df, valid_df, test_df = load_data()
    #train_df = pd.concat([train_df, valid_df])
    #test_gos = test_df['gos'].values
    #train_data, train_labels = train
    #val_data, val_labels = val
    #test_data, test_labels = test
    ##    

    train_data, train_labels, test_data, test_labels, val_data, val_labels = load_data2() 
    nb_classes = train_labels.shape[1]

    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Training data size: %d" % len(train_data))
    logging.info("Validation data size: %d" % len(val_data))
    logging.info("Test data size: %d" % len(test_data))
    generator_model_path = DATA_ROOT + 'models/new_model_seq_' + FUNCTION + '.h5'
    discriminator_model_path = DATA_ROOT + 'models/new_model_disc_seq_' + FUNCTION + '.h5'


    logging.info('Starting training the model')

    # Only test_generator is used below; the train/valid generators are
    # fitted but never consumed in this function.
    train_generator = DataGenerator(batch_size, nb_classes)
    train_generator.fit(train_data, train_labels)
    valid_generator = DataGenerator(batch_size, nb_classes)
    valid_generator.fit(val_data, val_labels)
    test_generator = DataGenerator(batch_size, nb_classes)
    test_generator.fit(test_data, test_labels)

    if is_train:
        generator_model, discriminator_model = get_model(params, nb_classes, batch_size)
        train_wgan(generator_model, discriminator_model, batch_size=batch_size, epochs=nb_epoch,
                   x_train=train_data, y_train=train_labels, x_val=val_data, y_val=val_labels,
                   generator_model_path=generator_model_path,
                   discriminator_model_path=discriminator_model_path)

    # The best checkpoint written by train_wgan is reloaded from disk; the
    # local name `model` shadows this function inside its own body.
    logging.info('Loading best model')
    model = load_model(generator_model_path,
                       custom_objects={'generator_recunstruction_loss_new': generator_recunstruction_loss_new,
                                       'wasserstein_loss': wasserstein_loss})

    logging.info('Predicting')
    # NOTE(review): with true division `steps` is a float and is not rounded
    # up, so the number of predicted rows may differ from len(test_data) -
    # confirm against DataGenerator and predict_generator behaviour.
    # [0] selects the first of the model's outputs.
    preds = model.predict_generator(test_generator, steps=len(test_data) / batch_size)[0]
    
    # incon = 0
    # for i in xrange(len(test_data)):
    #     for j in xrange(len(functions)):
    #         childs = set(go[functions[j]]['children']).intersection(func_set)
    #         ok = True`
    #         for n_id in childs:
    #             if preds[i, j] < preds[i, go_indexes[n_id]]:
    #                 preds[i, j] = preds[i, go_indexes[n_id]]
    #                 ok = False
    #         if not ok:
    #             incon += 1
    logging.info('Computing performance')
    f, p, r, t, preds_max = compute_performance(preds, test_labels) #, test_gos)
    roc_auc = compute_roc(preds, test_labels)
    mcc = compute_mcc(preds_max, test_labels)
    aupr , _ = compute_aupr(preds, test_labels)
    # Transposed so that each row corresponds to one function (label column).
    m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max = micro_macro_function_centric_f1(preds.T, test_labels.T)

    logging.info('Protein centric macro Th, PR, RC, F1: \t %f %f %f %f' % (t, p, r, f))
    logging.info('ROC AUC: \t %f ' % (roc_auc, ))
    logging.info('MCC: \t %f ' % (mcc, ))
    logging.info('AUPR: \t %f ' % (aupr, ))
    logging.info('Function centric macro PR, RC, F1: \t %f %f %f'  % (M_pr_max, M_rc_max, M_f1_max) )
    logging.info('Function centric micro PR, RC, F1: \t %f %f %f'  % (m_pr_max, m_rc_max, m_f1_max) )

    # NOTE(review): `functions` is the global loaded in main(), while the
    # label columns come from load_data2(); presumably they line up one to
    # one - verify, since function_centric_performance indexes both by row.
    function_centric_performance(functions, preds.T, test_labels.T, train_labels.T)



 def load_prot_ipro():
    """Read ``swissprot_ipro.tab`` into a data frame.

    Only lines with exactly three tab-separated fields are used: the second
    field becomes ``proteins`` and the third, split on ``;``, becomes
    ``ipros``.
    """
    records = []
    with open(DATA_ROOT + 'swissprot_ipro.tab') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) == 3:
                records.append((fields[1], fields[2].split(';')))
    return pd.DataFrame({
        'proteins': [protein for protein, _ in records],
        'ipros': [ipr_ids for _, ipr_ids in records]})


 def performanc_by_interpro():
    """Log F1, precision and recall of saved predictions per InterPro entry.

    Reads the pickled test predictions for the current ``FUNCTION``, joins
    them with the protein/InterPro table from ``load_prot_ipro`` and, for
    every InterPro id without parents, aggregates the proteins annotated
    with that id. One tab-separated line (id, protein count, F1, precision,
    recall) is logged per id that yields a positive precision + recall.
    """
    pred_df = pd.read_pickle(DATA_ROOT + 'test-' + FUNCTION + '-preds.pkl')
    ipro_df = load_prot_ipro()
    df = pred_df.merge(ipro_df, on='proteins', how='left')
    ipro = get_ipro()

    # NOTE(review): this nested helper is never called in this function.
    def reshape(values):
        values = np.hstack(values).reshape(
            len(values), len(values[0]))
        return values

    for ipro_id in ipro:
        # Only entries without parents are evaluated.
        if len(ipro[ipro_id]['parents']) > 0:
            continue
        labels = list()
        predictions = list()
        gos = list()
        for i, row in df.iterrows():
            # Rows without a match in the left join do not hold a list here.
            if not isinstance(row['ipros'], list):
                continue
            if ipro_id in row['ipros']:
                labels.append(row['labels'])
                predictions.append(row['predictions'])
                gos.append(row['gos'])
        pr = 0
        rc = 0
        total = 0
        p_total = 0
        for i in range(len(labels)):
            tp = np.sum(labels[i] * predictions[i])
            fp = np.sum(predictions[i]) - tp
            fn = np.sum(labels[i]) - tp
            # Ancestor terms of the annotations that are not in func_set are
            # counted as additional false negatives.
            all_gos = set()
            for go_id in gos[i]:
                if go_id in all_functions:
                    all_gos |= get_anchestors(go, go_id)
            all_gos.discard(GO_ID)
            all_gos -= func_set
            fn += len(all_gos)
            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                precision = tp / (1.0 * (tp + fp))
                recall = tp / (1.0 * (tp + fn))
                pr += precision
                rc += recall
        if total > 0 and p_total > 0:
            # Recall is averaged over all counted proteins, precision only
            # over those with at least one true positive.
            rc /= total
            pr /= p_total
            if pr + rc > 0:
                f = 2 * pr * rc / (pr + rc)
                logging.info('%s\t%d\t%f\t%f\t%f' % (
                    ipro_id, len(labels), f, pr, rc))


 def function_centric_performance(functions, preds, labels, labels_train):
    """Write the best F1 (with its precision/recall) of every function to CSV.

    Each row of ``preds``/``labels`` is one function. Predictions are
    rounded to two decimals and thresholds 0.01 .. 0.99 are scanned. A
    threshold with no true positives scores 1 when there are also no false
    positives or negatives and 0 otherwise. The output row holds the
    function id, its number of positive training labels, its height in the
    ontology and the best F1, precision and recall.
    """
    preds = np.round(preds, 2)
    table = []
    for row in range(preds.shape[0]):
        best_f, best_p, best_r = 0, 0, 0
        for step in range(1, 100):
            cutoff = step / 100.0
            called = (preds[row, :] > cutoff).astype(np.int32)
            tp = np.sum(called * labels[row, :])
            fp = np.sum(called) - tp
            fn = np.sum(labels[row, :]) - tp
            if tp > 0:
                precision = tp / (1.0 * (tp + fp))
                recall = tp / (1.0 * (tp + fn))
                f = 2 * precision * recall / (precision + recall)
            elif fp == 0 and fn == 0:
                precision, recall, f = 1, 1, 1
            else:
                precision, recall, f = 0, 0, 0

            if f > best_f:
                best_f, best_p, best_r = f, precision, recall
        train_positives = np.sum(labels_train[row, :])
        term_height = get_height(go, functions[row])
        table.append([functions[row], train_positives, term_height, best_f, best_p, best_r])
    pd.DataFrame(table).to_csv(
        'Con_GodGanSeq_results_' + FUNCTION + '.txt', sep='\t', index=False)



 def function_centric_performance_backup(functions, preds, labels, labels_train):
    """Older per-function report that also estimates a ROC AUC per function.

    For each function, thresholds 0.01 .. 0.99 are scanned; only thresholds
    with at least one true positive contribute a (false-positive-rate,
    sensitivity) point and compete for the best F1. The AUC is computed from
    those points when there is more than one, otherwise it is 0. Results go
    to the same CSV file as ``function_centric_performance``.
    """
    preds = np.round(preds, 2)
    table = []
    for row in range(len(functions)):
        best_f, best_p, best_r = 0, 0, 0
        fpr_points = []
        tpr_points = []
        usable_thresholds = 0
        for step in range(1, 100):
            cutoff = step / 100.0
            called = (preds[row, :] > cutoff).astype(np.int32)
            tp = np.sum(called * labels[row, :])
            fp = np.sum(called) - tp
            fn = np.sum(labels[row, :]) - tp
            if not tp > 0:
                continue
            sensitivity = tp / (1.0 * np.sum(labels[row, :]))
            # XOR with 1 flips 0/1 entries, giving the negatives.
            true_negatives = np.sum((called ^ 1) * (labels[row, :] ^ 1))
            specificity = true_negatives / (1.0 * np.sum(labels[row, :] ^ 1))
            fpr_points.append(1 - specificity)
            tpr_points.append(sensitivity)
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            f = 2 * precision * recall / (precision + recall)
            usable_thresholds += 1
            if f > best_f:
                best_f, best_p, best_r = f, precision, recall
        test_positives = np.sum(labels[row, :])
        train_positives = np.sum(labels_train[row, :])
        roc_auc = auc(fpr_points, tpr_points) if usable_thresholds > 1 else 0
        term_height = get_height(go, functions[row])
        table.append([functions[row], best_f, best_p, best_r,
                      test_positives, train_positives, term_height, roc_auc])
    pd.DataFrame(table).to_csv(
        'Con_GodGanSeq_results_'+FUNCTION +'.txt', sep='\t', index=False)



 def micro_macro_function_centric_f1_backup(preds, labels):
    """Older micro/macro function-centric F1 scan over thresholds 0.01 .. 0.99.

    Each row of ``preds``/``labels`` is one function. Rows that have no
    positives and no predictions at a threshold are ignored; only rows with
    at least one true positive contribute to the sums. Macro precision is
    averaged over rows with true positives, macro recall over all counted
    rows.

    Returns:
        ``(m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max)``:
        micro and macro precision/recall at the threshold with the best
        respective F1. All six values are 0 when no threshold produces a
        true positive (previously this raised ``UnboundLocalError``).
    """
    preds = np.round(preds, 2)
    # Defaults returned when no threshold ever improves on an F1 of zero.
    m_pr_max = m_rc_max = m_f1_max = 0
    M_pr_max = M_rc_max = M_f1_max = 0
    for t in range(1, 100):
        threshold = t / 100.0
        predictions = (preds > threshold).astype(np.int32)
        m_tp = 0
        m_fp = 0
        m_fn = 0
        M_pr = 0
        M_rc = 0
        total = 0
        p_total = 0
        for i in range(len(preds)):
            tp = np.sum(predictions[i, :] * labels[i, :])
            fp = np.sum(predictions[i, :]) - tp
            fn = np.sum(labels[i, :]) - tp
            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp > 0:
                m_tp += tp
                m_fp += fp
                m_fn += fn
                M_pr += tp / (1.0 * (tp + fp))
                M_rc += tp / (1.0 * (tp + fn))
                p_total += 1

        # Without any true positive neither precision nor F1 is defined.
        # p_total > 0 also implies total > 0.
        if p_total == 0:
            continue
        m_tp /= total
        m_fn /= total
        m_fp /= total
        m_pr = m_tp / (1.0 * (m_tp + m_fp))
        m_rc = m_tp / (1.0 * (m_tp + m_fn))
        M_pr /= p_total
        M_rc /= total
        m_f1 = 2 * m_pr * m_rc / (m_pr + m_rc)
        M_f1 = 2 * M_pr * M_rc / (M_pr + M_rc)

        if m_f1 > m_f1_max:
            m_f1_max = m_f1
            m_pr_max = m_pr
            m_rc_max = m_rc

        if M_f1 > M_f1_max:
            M_f1_max = M_f1
            M_pr_max = M_pr
            M_rc_max = M_rc

    return m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max





 def micro_macro_function_centric_f1(preds, labels):
    """Scan thresholds for the best micro and macro function-centric F1.

    Each row of ``preds``/``labels`` is one function. Predictions are
    rounded to two decimals and thresholds 0.005 .. 0.995 are scanned. A row
    (or the pooled micro counts) without true positives scores 1 when there
    are also no false positives or negatives and 0 otherwise.

    Returns:
        ``(m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max)``:
        micro and macro precision/recall at the threshold with the best
        respective F1. All six values are 0 when no threshold reaches an F1
        above zero (previously this raised ``UnboundLocalError``).
    """
    preds = np.round(preds, 2)
    # Defaults returned when no threshold ever improves on an F1 of zero.
    m_pr_max = m_rc_max = m_f1_max = 0
    M_pr_max = M_rc_max = M_f1_max = 0
    n_rows = preds.shape[0]
    for t in range(1, 200):
        threshold = t / 200.0
        predictions = (preds > threshold).astype(np.int32)
        m_tp = 0
        m_fp = 0
        m_fn = 0
        M_pr = 0
        M_rc = 0
        for i in range(n_rows):
            tp = np.sum(predictions[i, :] * labels[i, :])
            fp = np.sum(predictions[i, :]) - tp
            fn = np.sum(labels[i, :]) - tp
            m_tp += tp
            m_fp += fp
            m_fn += fn
            if tp > 0:
                pr = 1.0 * tp / (1.0 * (tp + fp))
                rc = 1.0 * tp / (1.0 * (tp + fn))
            elif fp == 0 and fn == 0:
                # Nothing to find and nothing predicted: perfect score.
                pr = 1
                rc = 1
            else:
                pr = 0
                rc = 0
            M_pr += pr
            M_rc += rc

        if m_tp > 0:
            m_pr = 1.0 * m_tp / (1.0 * (m_tp + m_fp))
            m_rc = 1.0 * m_tp / (1.0 * (m_tp + m_fn))
            m_f1 = 2.0 * m_pr * m_rc / (m_pr + m_rc)
        elif m_fp == 0 and m_fn == 0:
            m_pr = 1
            m_rc = 1
            m_f1 = 1
        else:
            m_pr = 0
            m_rc = 0
            m_f1 = 0

        M_pr /= n_rows
        M_rc /= n_rows
        if M_pr == 0 and M_rc == 0:
            M_f1 = 0
        else:
            M_f1 = 2.0 * M_pr * M_rc / (M_pr + M_rc)

        if m_f1 > m_f1_max:
            m_f1_max = m_f1
            m_pr_max = m_pr
            m_rc_max = m_rc

        if M_f1 > M_f1_max:
            M_f1_max = M_f1
            M_pr_max = M_pr
            M_rc_max = M_rc

    return m_pr_max, m_rc_max, m_f1_max, M_pr_max, M_rc_max, M_f1_max


 def compute_roc(preds, labels):
    """Return the ROC AUC computed over all flattened (label, score) pairs."""
    flat_labels = labels.flatten()
    flat_scores = preds.flatten()
    false_pos_rate, true_pos_rate, _ = roc_curve(flat_labels, flat_scores)
    return auc(false_pos_rate, true_pos_rate)


 def compute_aupr(preds, labels):
    """Return the area under the precision-recall curve and a placeholder.

    The curve is computed over all flattened (label, score) pairs. The
    second element of the returned tuple is always 0; no macro-averaged
    value is computed here.
    """
    flat_labels = labels.flatten()
    flat_scores = preds.flatten()
    precisions, recalls, _ = precision_recall_curve(flat_labels, flat_scores)
    macro_placeholder = 0
    return auc(recalls, precisions), macro_placeholder



 def compute_mcc(preds, labels):
    """Return the Matthews correlation coefficient over all flattened pairs."""
    flat_labels = labels.flatten()
    flat_predictions = preds.flatten()
    return matthews_corrcoef(flat_labels, flat_predictions)


 def compute_performance(preds, labels): #, gos):
    """Scan thresholds for the best protein-centric F1.

    Each row of ``preds``/``labels`` is one protein. Predictions are rounded
    to two decimals and thresholds 0.01 .. 0.99 are scanned. Proteins with
    no labels and no predictions at a threshold are ignored. Recall is
    averaged over all counted proteins, precision only over proteins with
    at least one true positive.

    Returns:
        ``(f_max, p_max, r_max, t_max, predictions_max)``: the best F1, its
        precision, recall and threshold, and the binary prediction matrix at
        that threshold. When no threshold produces a true positive the
        scores and threshold are 0 and the matrix is all zeros (previously
        this raised ``UnboundLocalError``).
    """
    preds = np.round(preds, 2)
    f_max = 0
    p_max = 0
    r_max = 0
    t_max = 0
    # Fallback for the case where no threshold yields a positive F1.
    predictions_max = np.zeros(preds.shape, dtype=np.int32)
    for t in range(1, 100):
        threshold = t / 100.0
        predictions = (preds > threshold).astype(np.int32)
        total = 0
        p = 0.0
        r = 0.0
        p_total = 0
        for i in range(labels.shape[0]):
            tp = np.sum(predictions[i, :] * labels[i, :])
            fp = np.sum(predictions[i, :]) - tp
            fn = np.sum(labels[i, :]) - tp
            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                p += tp / (1.0 * (tp + fp))
                r += tp / (1.0 * (tp + fn))
        # Precision is undefined when no protein has a true positive.
        if p_total == 0:
            continue
        r /= total
        p /= p_total
        if p + r > 0:
            f = 2 * p * r / (p + r)
            if f_max < f:
                f_max = f
                p_max = p
                r_max = r
                t_max = threshold
                predictions_max = predictions
    return f_max, p_max, r_max, t_max, predictions_max


 def get_gos(pred):
    """Return the annotations of the training entry closest to ``pred``.

    Closeness is the cosine distance between ``pred`` and each label vector
    in the global ``labels_gos``. Only distances strictly below 1.0 qualify;
    ``None`` is returned when no entry does.
    """
    best_distance = 1.0
    best_gos = None
    for position in range(len(labels_gos)):
        candidate_labels, candidate_gos = labels_gos[position]
        candidate_distance = distance.cosine(pred, candidate_labels)
        if candidate_distance < best_distance:
            best_distance, best_gos = candidate_distance, candidate_gos
    return best_gos


 def compute_similarity_performance(train_df, test_df, preds):
    """Score predictions by transferring annotations from the nearest train entry.

    For every prediction vector the annotations of the closest training
    label vector are looked up with ``get_gos`` in a worker pool. Both the
    transferred and the true annotations are expanded with their ancestors
    (restricted to ``all_functions``) and compared as sets.

    Args:
        train_df: frame with ``labels`` and ``gos`` columns.
        test_df: frame with a ``gos`` column, aligned with ``preds``.
        preds: iterable of prediction vectors.

    Returns:
        ``(f, p, r)`` averaged over all test entries that have at least one
        predicted or true term; ``(0.0, 0.0, 0.0)`` when there is none.
    """
    logging.info("Computing similarity performance")
    logging.info("Training data size %d" % len(train_df))
    train_labels = train_df['labels'].values
    train_gos = train_df['gos'].values
    global labels_gos
    # get_gos() takes len() of and indexes into this global, so it must be a
    # real list: on Python 3 zip() returns a one-shot iterator.
    labels_gos = list(zip(train_labels, train_gos))
    pool = Pool(64)
    try:
        pred_gos = pool.map(get_gos, preds)
    finally:
        # Always release the worker processes.
        pool.close()
        pool.join()
    total = 0
    p = 0.0
    r = 0.0
    f = 0.0
    test_gos = test_df['gos'].values
    for gos, tgos in zip(pred_gos, test_gos):
        # get_gos() returns None when no training entry is close enough.
        if gos is None:
            gos = ()
        predicted = set()
        expected = set()
        for go_id in gos:
            if go_id in all_functions:
                predicted |= get_anchestors(go, go_id)
        for go_id in tgos:
            if go_id in all_functions:
                expected |= get_anchestors(go, go_id)
        tp = len(predicted.intersection(expected))
        fp = len(predicted - expected)
        fn = len(expected - predicted)
        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            p += precision
            r += recall
            f += 2 * precision * recall / (precision + recall)
    if total == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0, 0.0, 0.0
    return f / total, p / total, r / total


 def print_report(report, go_id):
    """Append a titled classification report for ``go_id`` to ``reports.txt``."""
    report_path = DATA_ROOT + 'reports.txt'
    with open(report_path, 'a') as report_file:
        report_file.write('Classification report for ' + go_id + '\n')
        report_file.write(report + '\n')


 # Run the click command only when the file is executed as a script.
 if __name__ == '__main__':
    main()