
feat: add main files

main
Taha Mohammadzadeh, 5 months ago
commit ce5adb5616
6 changed files with 85 additions and 138 deletions:

  1. DeepDRA.py      (+1, -5)
  2. data_loader.py  (+7, -8)
  3. evaluation.py   (+67, -32)
  4. main.py         (+3, -3)
  5. mlp.py          (+0, -1)
  6. utils.py        (+7, -89)

DeepDRA.py  (+1, -5)

@@ -5,7 +5,7 @@ from torch.optim import lr_scheduler
from autoencoder import Autoencoder
from evaluation import Evaluation
from mlp import MLP
from utils import *
from utils import MODEL_FOLDER


class DeepDRA(nn.Module):
@@ -162,10 +162,6 @@ def test(model, test_loader, reverse=False):
# Apply reverse if specified
predictions = 1 - mlp_output if reverse else mlp_output

# # Store predictions and ground truth labels
# all_predictions.extend(predictions.cpu().numpy())
# all_labels.extend(labels.cpu().numpy())

# Evaluate the predictions using the specified metrics
result = Evaluation.evaluate(labels, predictions)
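
As a usage illustration (not part of this commit), a minimal hedged sketch of driving test() from a held-out split; the tensor names, batch size, and the (cell, drug, label) batch layout are assumptions:

    # Hedged sketch: tensor names and batch layout are assumptions, not from this commit.
    from torch.utils.data import DataLoader, TensorDataset

    test_dataset = TensorDataset(x_cell_test_tensor, x_drug_test_tensor, y_test_tensor)
    test_loader = DataLoader(test_dataset, batch_size=len(test_dataset))
    results = test(model, test_loader, reverse=False)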


data_loader.py  (+7, -8)

@@ -1,6 +1,11 @@
from utils import *
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.impute import SimpleImputer
import os

from utils import DRUG_DATA_FOLDER


class RawDataLoader:
@staticmethod
@@ -39,7 +44,6 @@ class RawDataLoader:
# Step 5: Return the loaded data and adjusted drug screening data
return data, drug_screen


@staticmethod
def intersect_features(data1, data2):
"""
@@ -109,7 +113,6 @@ class RawDataLoader:

return data[0]


@staticmethod
def load_raw_files(raw_file_directory, data_modalities, intersect=True):
raw_dict = {}
@@ -132,8 +135,6 @@ class RawDataLoader:
df.columns = df.columns.str.replace('_cell_CN', '')
df.columns = df.columns.str.replace('_cell_exp', '')



# Note that the drug_comp raw table has some NA values, so we impute it
if any(df.isna()):
df = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(df),
@@ -168,13 +169,12 @@ class RawDataLoader:
@staticmethod
def load_screening_files(filename="AUC_matS_comb.tsv", sep=',', ):
df = pd.read_csv(filename, sep=sep, index_col=0)
# df = df.drop(['Erlotinib','17-AAG','PD-0325901','PHA-665752','PHA-665752','TAE684','Sorafenib','PLX4720','selumetinib','PD-0332991','Paclitaxel','Nilotinib','Saracatinib'],axis=1)
return df
# return pd.read_csv(os.path.join(DATA_FOLDER, "drug_screening_matrix_GDSC.tsv"), sep='\t', index_col=0)

@staticmethod
def adjust_screening_raw(drug_screen, data_dict):
raw_cell_names = []
raw_drug_names = []
for key, value in data_dict.items():
if 'cell' in key:
if len(raw_cell_names) == 0:
@@ -187,7 +187,6 @@ class RawDataLoader:
screening_cell_names = drug_screen.index
screening_drug_names = drug_screen.columns


common_cell_names = list(set(raw_cell_names).intersection(set(screening_cell_names)))
common_drug_names = list(set(raw_drug_names).intersection(set(screening_drug_names)))
for key, value in data_dict.items():
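
For orientation, a hedged sketch of how these loader methods might be chained; the folder constants come from utils.py in this commit, while the modality list and separator are assumptions:

    # Hedged usage sketch of the RawDataLoader pipeline.
    from data_loader import RawDataLoader
    from utils import RAW_BOTH_DATA_FOLDER, BOTH_SCREENING_DATA_FOLDER

    data_dict = RawDataLoader.load_raw_files(raw_file_directory=RAW_BOTH_DATA_FOLDER,
                                             data_modalities=['cell_mut', 'drug_desc', 'drug_finger'])
    drug_screen = RawDataLoader.load_screening_files(filename=BOTH_SCREENING_DATA_FOLDER, sep='\t')
    # adjust_screening_raw aligns cell and drug names between the raw tables and the
    # screening matrix; its exact return value is not visible in this hunk.
    RawDataLoader.adjust_screening_raw(drug_screen, data_dict)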

evaluation.py  (+67, -32)

@@ -1,66 +1,102 @@
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, f1_score, precision_score, \
recall_score, accuracy_score
from matplotlib import pyplot as plt
from sklearn import metrics
import numpy as np
import pandas as pd
from statsmodels.stats.weightstats import ttest_ind

from utils import *


class Evaluation:
@staticmethod
def plot_train_val_accuracy(train_accuracies, val_accuracies, num_epochs):
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('h')
plt.plot(range(1, num_epochs + 1), train_accuracies)
plt.plot(range(1, num_epochs + 1), val_accuracies)
"""
Plot training and validation accuracies over epochs.

Parameters:
- train_accuracies (list): List of training accuracies.
- val_accuracies (list): List of validation accuracies.
- num_epochs (int): Number of training epochs.

Returns:
- None
"""
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracies')
plt.plot(range(1, num_epochs + 1), train_accuracies, label='Train Accuracy')
plt.plot(range(1, num_epochs + 1), val_accuracies, label='Validation Accuracy')
plt.legend()
plt.show()

@staticmethod
def plot_train_val_loss(train_loss, val_loss, num_epochs):
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('h')
plt.plot(range(1, num_epochs + 1), train_loss)
plt.plot(range(1, num_epochs + 1), val_loss)
"""
Plot training and validation losses over epochs.

Parameters:
- train_loss (list): List of training losses.
- val_loss (list): List of validation losses.
- num_epochs (int): Number of training epochs.

Returns:
- None
"""
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Losses')
plt.plot(range(1, num_epochs + 1), train_loss, label='Train Loss')
plt.plot(range(1, num_epochs + 1), val_loss, label='Validation Loss')
plt.legend()
plt.show()
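
A minimal hedged usage sketch for these plotting helpers, with illustrative loss values standing in for curves collected during training:

    # Hedged sketch: the values are illustrative; real curves come from the training loop.
    train_losses = [0.71, 0.52, 0.44, 0.39]
    val_losses = [0.73, 0.58, 0.50, 0.47]
    Evaluation.plot_train_val_loss(train_losses, val_losses, num_epochs=len(train_losses))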


@staticmethod
def evaluate(all_targets, mlp_output, show_plot=True):
"""
Evaluate model performance based on predictions and targets.

Parameters:
- all_targets (numpy.ndarray): True target labels.
- mlp_output (numpy.ndarray): Predicted probabilities.
- show_plot (bool): Whether to display ROC and PR curves.

Returns:
- results (dict): Dictionary containing evaluation metrics.
"""
# Step 1: Convert predicted probabilities to binary labels
predicted_labels = np.where(mlp_output > 0.5, 1, 0)
# Collect predictions and targets for later evaluation
predicted_labels = predicted_labels.reshape(-1)
# Convert predictions and targets to numpy arrays
all_predictions = predicted_labels

# Calculate and print AUC
# Step 2: Calculate and print AUC
fpr, tpr, thresholds = metrics.roc_curve(all_targets, mlp_output)
auc = np.round(metrics.auc(fpr, tpr), 2)

# Calculate and print AUPRC
print(all_targets)
# Step 3: Calculate and print AUPRC
precision, recall, thresholds = metrics.precision_recall_curve(all_targets, mlp_output)
auprc = np.round(metrics.auc(recall, precision), 2)
# auprc = average_precision_score(all_targets, mlp_output)

print('Accuracy: {:.2f}'.format(np.round(accuracy_score(all_targets, all_predictions), 2)))
print('AUC: {:.2f}'.format(auc))
print('AUPRC: {:.2f}'.format(auprc))

# Calculate and print confusion matrix
# Step 4: Print accuracy, AUC, AUPRC, and confusion matrix
accuracy = accuracy_score(all_targets, all_predictions)
cm = confusion_matrix(all_targets, all_predictions)
accuracy = cm.trace() / np.sum(cm)
precision = cm[0, 0] / (cm[0, 0] + cm[0, 1])
recall = cm[0, 0] / (cm[0, 0] + cm[1, 0])
f1_score = 2 * precision * recall / (precision + recall)
print('Confusion matrix:\n', cm, sep='')
print(f'Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1 score: {f1_score:.3f}')
print(f'Accuracy: {accuracy:.2f}')
print(f'AUC: {auc:.2f}')
print(f'AUPRC: {auprc:.2f}')
print(f'Confusion matrix:\n{cm}')
print(f'Precision: {precision:.3f}, Recall: {recall:.3f}, F1 score: {f1_score:.3f}')

# Step 5: Display ROC and PR curves if requested
if show_plot:
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve: AUC={auc}')
plt.plot(fpr, tpr)
plt.show()
# print(f'AUC: {auc}')

plt.xlabel('Recall')
plt.ylabel('Precision')
@@ -68,6 +104,7 @@ class Evaluation:
plt.plot(recall, precision)
plt.show()

# Violin plot for DeepDRA scores
prediction_targets = pd.DataFrame({}, columns=['Prediction', 'Target'])

res = pd.concat(
@@ -76,11 +113,8 @@ class Evaluation:

res.columns = prediction_targets.columns
prediction_targets = pd.concat([prediction_targets, res])

class_one = prediction_targets.loc[prediction_targets['Target'] == 0, 'Prediction'].astype(
np.float32).tolist()
class_minus_one = prediction_targets.loc[prediction_targets['Target'] == 1, 'Prediction'].astype(
np.float32).tolist()
class_one = prediction_targets.loc[prediction_targets['Target'] == 0, 'Prediction']
class_minus_one = prediction_targets.loc[prediction_targets['Target'] == 1, 'Prediction']

fig, ax = plt.subplots()
ax.set_ylabel("DeepDRA score")
@@ -92,12 +126,13 @@ class Evaluation:
p_value = np.format_float_scientific(ttest_ind(class_one, class_minus_one)[1])
cancer = 'all'
plt.title(
f'Responder/Non responder scores for {cancer} cancer with \np-value ~= {p_value[0]}e{p_value[-3:]} ')
f'Responder/Non-responder scores for {cancer} cancer with \np-value ~= {p_value[0]}e{p_value[-3:]} ')
bp = ax.violinplot(data_to_plot, showextrema=True, showmeans=True, showmedians=True)
bp['cmeans'].set_color('r')
bp['cmedians'].set_color('g')
plt.show()

# Step 6: Return evaluation metrics in a dictionary
return {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 score': f1_score, 'AUC': auc,
'AUPRC': auprc}
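
A hedged usage sketch for Evaluation.evaluate with toy arrays; in DeepDRA the real inputs are the test labels and the MLP's predicted probabilities:

    # Hedged sketch with toy arrays; real inputs come from the model's test pass.
    import numpy as np

    labels = np.array([0, 1, 1, 0, 1])
    probs = np.array([0.20, 0.85, 0.60, 0.40, 0.90])
    results = Evaluation.evaluate(labels, probs, show_plot=False)
    print(results['AUC'], results['AUPRC'])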


main.py  (+3, -3)

@@ -7,10 +7,10 @@ from DeepDRA import DeepDRA, train, test
from data_loader import RawDataLoader
from evaluation import Evaluation
from utils import *
from mlp import MLP
import random
import torch
import numpy as np
import pandas as pd


def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test, cell_sizes, drug_sizes):
@@ -60,10 +60,10 @@ def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train,
train(model, train_loader, num_epochs=num_epochs)

# Step 7: Save the trained model
torch.save(model, 'DeepDRA.pth')
torch.save(model, MODEL_FOLDER + 'DeepDRA.pth')

# Step 8: Load the saved model
model = torch.load('DeepDRA.pth')
model = torch.load( MODEL_FOLDER + 'DeepDRA.pth')

# Step 9: Convert your test data to PyTorch tensors
x_cell_test_tensor = torch.Tensor(x_cell_test.values)
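
One hedged suggestion rather than the commit's code: MODEL_FOLDER is built with os.path.join in utils.py and carries no trailing separator, so plain string concatenation yields 'data/modelDeepDRA.pth'. An explicit join avoids that:

    # Hedged sketch, not the code in this commit.
    import os
    import torch

    model_path = os.path.join(MODEL_FOLDER, 'DeepDRA.pth')
    torch.save(model, model_path)
    model = torch.load(model_path)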

mlp.py  (+0, -1)

@@ -2,7 +2,6 @@ from torch import nn
from torch import optim, no_grad
import torch
from evaluation import Evaluation
from utils import data_modalities_abbreviation
class EarlyStopper:
def __init__(self, patience=1, min_delta=0):
self.patience = patience

utils.py  (+7, -89)

@@ -1,40 +1,15 @@
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import sklearn as sk
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist, squareform
import h2o
from h2o.estimators import H2ODeepLearningEstimator
from sklearn.impute import SimpleImputer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pickle
from sklearn import metrics
from copy import deepcopy
import pyreadr
import requests
from time import time
from math import ceil
from statsmodels.stats.weightstats import ttest_ind
import torch.optim.lr_scheduler as lr_scheduler
from sklearn.model_selection import KFold


DATA_FOLDER = 'data'
RES_DATA_FOLDER = os.path.join(DATA_FOLDER, 'res')
TEST_DATA_FOLDER = os.path.join(DATA_FOLDER, 'final_test_data')
TEST_TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_test_data')
SIM_DATA_FOLDER = os.path.join(DATA_FOLDER, 'similarity_data')
RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'raw_data')
RAW_BOTH_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_GDSC_Data')
RAW_BOTH_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_GDSC_data')
DRUG_DATA_FOLDER = os.path.join(DATA_FOLDER, 'drug_data')

NEW_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'new_raw_data')
GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data')
CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_raw')
CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_data')
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv')
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv')
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv')

CTRP_FOLDER = os.path.join(DATA_FOLDER, 'CTRP')
GDSC_FOLDER = os.path.join(DATA_FOLDER, 'GDSC')
@@ -42,32 +17,7 @@ CCLE_FOLDER = os.path.join(DATA_FOLDER, 'CCLE')

MODEL_FOLDER = os.path.join(DATA_FOLDER, 'model')

CTRP_EXPERIMENT_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_experiment.txt')
CTRP_COMPOUND_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_compound.txt')
CTRP_CELLLINE_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_cell_line.txt')
CTRP_AUC_FILE = os.path.join(CTRP_FOLDER, 'v20.data.curves_post_qc.txt')

GDSC_AUC_FILE = os.path.join(GDSC_FOLDER, 'GDSC2_fitted_dose_response.csv')
GDSC_cnv_data_FILE = os.path.join(GDSC_FOLDER, 'cnv_abs_copy_number_picnic_20191101.csv')
GDSC_methy_data_FILE = os.path.join(GDSC_FOLDER, 'F2_METH_CELL_DATA.txt')
GDSC_methy_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'methSampleId_2_cosmicIds.xlsx')
GDSC_exp_data_FILE = os.path.join(GDSC_FOLDER, 'Cell_line_RMA_proc_basalExp.txt')
GDSC_exp_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'E-MTAB-3610.sdrf.txt')
GDSC_mut_data_FILE = os.path.join(GDSC_FOLDER, 'mutations_all_20230202.csv')
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv')
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv')
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv')

CCLE_mut_data_FILE = os.path.join(CCLE_FOLDER, 'CCLE_mutations.csv')

TABLE_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_table.tsv')
MATRIX_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_matrix.tsv')

MODEL_FILE = os.path.join(MODEL_FOLDER, 'trained_model_V1_EMDP.sav')
TEST_FILE = os.path.join(TEST_DATA_FOLDER, 'test.gzip')
RESULT_FILE = os.path.join(RES_DATA_FOLDER, 'result.tsv')

TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_test_data')
TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_data')
TCGA_SCREENING_DATA = os.path.join(TCGA_DATA_FOLDER, 'TCGA_screening_matrix.tsv')

BUILD_SIM_MATRICES = True # Make this variable True to build similarity matrices from raw data
@@ -81,36 +31,4 @@ DATA_MODALITIES = ['cell_mut', 'drug_desc', 'drug_finger']
RANDOM_SEED = 42 # Must be used wherever can be used


def data_modalities_abbreviation():
abb = []
if 'cell_CN' in DATA_MODALITIES:
abb.append('C')
if 'cell_exp' in DATA_MODALITIES:
abb.append('E')
if 'cell_mut' in DATA_MODALITIES:
abb.append('M')
if 'cell_methy' in DATA_MODALITIES:
abb.append('T')
if 'drug_DT' in DATA_MODALITIES:
abb.append('D')
if 'drug_comp' in DATA_MODALITIES:
abb.append('P')
return ''.join(abb)


""" TRAIN_INTEGRATION_METHOD used for each cell's and drug_data's data definitions:
SIMILARITY: A kernel based integration method in which based on the similarity of each cell's data with the training cell's
data the input features for the multi layer perceptron (MLP) is constructed. The similarity function used could be different for
each data modality (euclidean, jaccard,l1_norm, or ...)

AUTO_ENCODER_V1: In this version of integrating multi-omics, for each data modality an autoencoder is trained to reduce the
dimension of the features and finally a concatenation of each autoencoder's latent space builds up the input layer of the MLP.

AUTO_ENCODER_V2: In this version of integrating multi-omics data, we train a big autoencoder which reduces the dimension of
all the different data modalities features at the same time to a smaller feature space. This version of integrating could
take a lot of memory and time to integrate the data and might be computationally expensive.

AUTO_ENCODER_V3: IN this version of integrating multi-omics data, we train an autoencoder for all the modalities kinda same as
the autoencoder version 2 but with this difference that the encoder and decoder layers are separate from each other and
just the latent layer is shared among different data modalities.
"""
