
feat: add main files

Branch: main
Taha Mohammadzadeh committed 11 months ago
parent commit ce5adb5616
6 changed files with 85 additions and 138 deletions
1. DeepDRA.py (+1, -5)
2. data_loader.py (+7, -8)
3. evaluation.py (+67, -32)
4. main.py (+3, -3)
5. mlp.py (+0, -1)
6. utils.py (+7, -89)

DeepDRA.py (+1, -5)

from autoencoder import Autoencoder
from evaluation import Evaluation
from mlp import MLP
- from utils import *
+ from utils import MODEL_FOLDER




class DeepDRA(nn.Module):
# Apply reverse if specified
predictions = 1 - mlp_output if reverse else mlp_output


- # # Store predictions and ground truth labels
- # all_predictions.extend(predictions.cpu().numpy())
- # all_labels.extend(labels.cpu().numpy())

# Evaluate the predictions using the specified metrics
result = Evaluation.evaluate(labels, predictions)
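The DeepDRA.py hunks above are cut off by the diff view. As orientation only, a rough editorial sketch of the test-time pattern they imply follows; the loader's batch layout and the model's return values are assumptions, not part of this commit.

# Hypothetical sketch of the evaluation pattern implied above: run the trained
# model on held-out data, optionally flip the scores when the label convention
# is inverted, then hand everything to Evaluation.evaluate.
import torch
from evaluation import Evaluation

def evaluate_model(model, test_loader, reverse=False):
    model.eval()
    all_scores, all_labels = [], []
    with torch.no_grad():
        for (x_cell, x_drug), labels in test_loader:      # assumed batch layout
            _, _, mlp_output = model(x_cell, x_drug)      # assumed forward() signature
            scores = 1 - mlp_output if reverse else mlp_output
            all_scores.append(scores.reshape(-1).cpu())
            all_labels.append(labels.reshape(-1).cpu())
    return Evaluation.evaluate(torch.cat(all_labels).numpy(),
                               torch.cat(all_scores).numpy())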



data_loader.py (+7, -8)

- from utils import *
+ import pandas as pd
+ import numpy as np
+ from tqdm import tqdm
+ from sklearn.impute import SimpleImputer
import os


+ from utils import DRUG_DATA_FOLDER



class RawDataLoader:
@staticmethod
# Step 5: Return the loaded data and adjusted drug screening data
return data, drug_screen



@staticmethod
def intersect_features(data1, data2):
"""


return data[0]



@staticmethod
def load_raw_files(raw_file_directory, data_modalities, intersect=True):
raw_dict = {}
df.columns = df.columns.str.replace('_cell_CN', '')
df.columns = df.columns.str.replace('_cell_exp', '')




# Note that drug_comp raw table has some NA values so we should impute it # Note that drug_comp raw table has some NA values so we should impute it
if any(df.isna()):
df = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(df),
@staticmethod
def load_screening_files(filename="AUC_matS_comb.tsv", sep=',', ):
df = pd.read_csv(filename, sep=sep, index_col=0)
# df = df.drop(['Erlotinib','17-AAG','PD-0325901','PHA-665752','PHA-665752','TAE684','Sorafenib','PLX4720','selumetinib','PD-0332991','Paclitaxel','Nilotinib','Saracatinib'],axis=1)
return df
# return pd.read_csv(os.path.join(DATA_FOLDER, "drug_screening_matrix_GDSC.tsv"), sep='\t', index_col=0)
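Because the imputation hunk above is cut off mid-statement, here is a small self-contained editorial sketch of the load-and-impute pattern it uses; the keyword completion after fit_transform is an assumption of this sketch, not a quote of the commit.

# Hypothetical, minimal illustration of the pattern above: read a drug-screening
# matrix indexed by cell line, then mean-impute missing values while keeping the
# original index and column names.
import pandas as pd
from sklearn.impute import SimpleImputer

def load_matrix(path, sep=','):
    df = pd.read_csv(path, sep=sep, index_col=0)
    # note: `any(df.isna())` in the hunk above iterates over column labels only;
    # df.isna().any().any() is the stricter element-wise check
    if df.isna().any().any():
        df = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(df),
                          index=df.index, columns=df.columns)
    return df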


@staticmethod
def adjust_screening_raw(drug_screen, data_dict):
raw_cell_names = []
raw_drug_names = []
for key, value in data_dict.items():
if 'cell' in key:
if len(raw_cell_names) == 0:
screening_cell_names = drug_screen.index
screening_drug_names = drug_screen.columns



common_cell_names = list(set(raw_cell_names).intersection(set(screening_cell_names)))
common_drug_names = list(set(raw_drug_names).intersection(set(screening_drug_names)))
for key, value in data_dict.items():
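The adjust_screening_raw hunk is heavily truncated here. As orientation, the name-intersection step amounts to something like the following simplified editorial sketch, which assumes one cell table and one drug table instead of the full data_dict.

# Editorial sketch of the alignment step above: restrict the screening matrix and
# the feature tables to the cell lines and drugs they have in common, so every
# modality and the labels refer to the same samples.
def align(drug_screen, cell_df, drug_df):
    common_cells = sorted(set(cell_df.index) & set(drug_screen.index))
    common_drugs = sorted(set(drug_df.index) & set(drug_screen.columns))
    return (drug_screen.loc[common_cells, common_drugs],
            cell_df.loc[common_cells],
            drug_df.loc[common_drugs])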

evaluation.py (+67, -32)

from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, f1_score, precision_score, \
    recall_score, accuracy_score
+ from matplotlib import pyplot as plt
+ from sklearn import metrics
+ import numpy as np
+ import pandas as pd
+ from statsmodels.stats.weightstats import ttest_ind


- from utils import *




class Evaluation:
@staticmethod
def plot_train_val_accuracy(train_accuracies, val_accuracies, num_epochs):
- plt.xlabel('epoch')
- plt.ylabel('accuracy')
- plt.title('h')
- plt.plot(range(1, num_epochs + 1), train_accuracies)
- plt.plot(range(1, num_epochs + 1), val_accuracies)
"""
Plot training and validation accuracies over epochs.

Parameters:
- train_accuracies (list): List of training accuracies.
- val_accuracies (list): List of validation accuracies.
- num_epochs (int): Number of training epochs.

Returns:
- None
"""
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracies')
plt.plot(range(1, num_epochs + 1), train_accuracies, label='Train Accuracy')
plt.plot(range(1, num_epochs + 1), val_accuracies, label='Validation Accuracy')
plt.legend()
plt.show()


@staticmethod
def plot_train_val_loss(train_loss, val_loss, num_epochs):
- plt.xlabel('epoch')
- plt.ylabel('loss')
- plt.title('h')
- plt.plot(range(1, num_epochs + 1), train_loss)
- plt.plot(range(1, num_epochs + 1), val_loss)
"""
Plot training and validation losses over epochs.

Parameters:
- train_loss (list): List of training losses.
- val_loss (list): List of validation losses.
- num_epochs (int): Number of training epochs.

Returns:
- None
"""
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Losses')
plt.plot(range(1, num_epochs + 1), train_loss, label='Train Loss')
plt.plot(range(1, num_epochs + 1), val_loss, label='Validation Loss')
plt.legend()
plt.show()



@staticmethod
def evaluate(all_targets, mlp_output, show_plot=True):
"""
Evaluate model performance based on predictions and targets.

Parameters:
- all_targets (numpy.ndarray): True target labels.
- mlp_output (numpy.ndarray): Predicted probabilities.
- show_plot (bool): Whether to display ROC and PR curves.

Returns:
- results (dict): Dictionary containing evaluation metrics.
"""
# Step 1: Convert predicted probabilities to binary labels
predicted_labels = np.where(mlp_output > 0.5, 1, 0)
- # Collect predictions and targets for later evaluation
predicted_labels = predicted_labels.reshape(-1)
- # Convert predictions and targets to numpy arrays
all_predictions = predicted_labels


- # Calculate and print AUC
+ # Step 2: Calculate and print AUC
fpr, tpr, thresholds = metrics.roc_curve(all_targets, mlp_output)
auc = np.round(metrics.auc(fpr, tpr), 2)


- # Calculate and print AUPRC
- print(all_targets)
+ # Step 3: Calculate and print AUPRC
precision, recall, thresholds = metrics.precision_recall_curve(all_targets, mlp_output)
auprc = np.round(metrics.auc(recall, precision), 2)
# auprc = average_precision_score(all_targets, mlp_output)

- print('Accuracy: {:.2f}'.format(np.round(accuracy_score(all_targets, all_predictions), 2)))
- print('AUC: {:.2f}'.format(auc))
- print('AUPRC: {:.2f}'.format(auprc))


- # Calculate and print confusion matrix
+ # Step 4: Print accuracy, AUC, AUPRC, and confusion matrix
+ accuracy = accuracy_score(all_targets, all_predictions)
cm = confusion_matrix(all_targets, all_predictions)
- accuracy = cm.trace() / np.sum(cm)
precision = cm[0, 0] / (cm[0, 0] + cm[0, 1])
recall = cm[0, 0] / (cm[0, 0] + cm[1, 0])
f1_score = 2 * precision * recall / (precision + recall)
- print('Confusion matrix:\n', cm, sep='')
- print(f'Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1 score: {f1_score:.3f}')
+ print(f'Accuracy: {accuracy:.2f}')
+ print(f'AUC: {auc:.2f}')
+ print(f'AUPRC: {auprc:.2f}')
+ print(f'Confusion matrix:\n{cm}')
+ print(f'Precision: {precision:.3f}, Recall: {recall:.3f}, F1 score: {f1_score:.3f}')
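A note for readers on the confusion-matrix arithmetic above: with sklearn's convention (rows are true labels, columns are predictions), the ratio assigned to `precision` is the recall of class 0 (specificity) and the ratio assigned to `recall` is the precision of class 0. The following editorial check with toy arrays, not taken from the commit, makes the correspondence explicit.

# Editorial check of the cm-derived ratios above against sklearn's per-class
# metrics, treating class 0 as the positive class.
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score

y_true = np.array([0, 0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 1, 0, 0, 1])
cm = confusion_matrix(y_true, y_pred)                 # rows = true label, columns = prediction

class0_recall = cm[0, 0] / (cm[0, 0] + cm[0, 1])      # fraction of true 0s predicted as 0
class0_precision = cm[0, 0] / (cm[0, 0] + cm[1, 0])   # fraction of predicted 0s that are true 0s

assert np.isclose(class0_recall, recall_score(y_true, y_pred, pos_label=0))
assert np.isclose(class0_precision, precision_score(y_true, y_pred, pos_label=0))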


+ # Step 5: Display ROC and PR curves if requested
if show_plot:
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve: AUC={auc}')
plt.plot(fpr, tpr)
plt.show()
# print(f'AUC: {auc}')


plt.xlabel('Recall')
plt.ylabel('Precision')
plt.plot(recall, precision)
plt.show()


+ # Violin plot for DeepDRA scores
prediction_targets = pd.DataFrame({}, columns=['Prediction', 'Target'])


res = pd.concat(


res.columns = prediction_targets.columns
prediction_targets = pd.concat([prediction_targets, res])

- class_one = prediction_targets.loc[prediction_targets['Target'] == 0, 'Prediction'].astype(
-     np.float32).tolist()
- class_minus_one = prediction_targets.loc[prediction_targets['Target'] == 1, 'Prediction'].astype(
-     np.float32).tolist()
+ class_one = prediction_targets.loc[prediction_targets['Target'] == 0, 'Prediction']
+ class_minus_one = prediction_targets.loc[prediction_targets['Target'] == 1, 'Prediction']


fig, ax = plt.subplots()
ax.set_ylabel("DeepDRA score")
p_value = np.format_float_scientific(ttest_ind(class_one, class_minus_one)[1])
cancer = 'all'
plt.title(
- f'Responder/Non responder scores for {cancer} cancer with \np-value ~= {p_value[0]}e{p_value[-3:]} ')
+ f'Responder/Non-responder scores for {cancer} cancer with \np-value ~= {p_value[0]}e{p_value[-3:]} ')
bp = ax.violinplot(data_to_plot, showextrema=True, showmeans=True, showmedians=True)
bp['cmeans'].set_color('r')
bp['cmedians'].set_color('g')
plt.show()


+ # Step 6: Return evaluation metrics in a dictionary
return {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 score': f1_score, 'AUC': auc,
        'AUPRC': auprc}
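As a quick orientation for readers of this diff, calling the evaluator on a toy set of probabilities might look like the following; the arrays are made up for illustration and are not from the commit.

# Hypothetical usage of Evaluation.evaluate on toy data; the returned dict has the
# keys shown in the hunk above (Accuracy, Precision, Recall, F1 score, AUC, AUPRC).
import numpy as np
from evaluation import Evaluation

y_true = np.array([0, 1, 1, 0, 1, 0, 1, 0])
y_prob = np.array([0.2, 0.8, 0.6, 0.3, 0.9, 0.4, 0.7, 0.1])

results = Evaluation.evaluate(y_true, y_prob, show_plot=False)
print(results['AUC'], results['AUPRC'])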



main.py (+3, -3)

from data_loader import RawDataLoader
from evaluation import Evaluation
from utils import *
from mlp import MLP
import random
import torch
import numpy as np
import pandas as pd




def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test, cell_sizes, drug_sizes):
train(model, train_loader, num_epochs=num_epochs)


# Step 7: Save the trained model
- torch.save(model, 'DeepDRA.pth')
+ torch.save(model, MODEL_FOLDER + 'DeepDRA.pth')


# Step 8: Load the saved model
- model = torch.load('DeepDRA.pth')
+ model = torch.load( MODEL_FOLDER + 'DeepDRA.pth')
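One caveat worth flagging for reviewers: MODEL_FOLDER is defined in utils.py with os.path.join and has no trailing separator, so the concatenation above yields a path like 'data/modelDeepDRA.pth'. Below is an editorial sketch of the join-based alternative, offered as a suggestion rather than as part of the commit.

# Editorial sketch: build the checkpoint path with os.path.join so the directory
# separator is inserted automatically, and make sure the folder exists first.
import os
import torch
from utils import MODEL_FOLDER

model_path = os.path.join(MODEL_FOLDER, 'DeepDRA.pth')
os.makedirs(MODEL_FOLDER, exist_ok=True)   # avoid a FileNotFoundError on first save
torch.save(model, model_path)              # `model` is the trained DeepDRA instance from above
model = torch.load(model_path)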


# Step 9: Convert your test data to PyTorch tensors
x_cell_test_tensor = torch.Tensor(x_cell_test.values)

mlp.py (+0, -1)

from torch import optim, no_grad
import torch
from evaluation import Evaluation
- from utils import data_modalities_abbreviation
class EarlyStopper:
def __init__(self, patience=1, min_delta=0):
self.patience = patience
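Only the first lines of EarlyStopper appear in this hunk. For orientation, a conventional patience/min_delta early stopper looks roughly like the sketch below; it is an assumed illustration, not the repository's actual implementation.

# Hypothetical sketch of a patience/min_delta early stopper: training is halted
# once the validation loss has failed to improve by at least `min_delta` for
# `patience` consecutive checks.
class EarlyStopper:
    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')

    def early_stop(self, val_loss):
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience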

utils.py (+7, -89)

import os
- import pandas as pd
- import numpy as np
- from tqdm import tqdm
- import sklearn as sk
- from matplotlib import pyplot as plt
- from scipy.spatial.distance import pdist, squareform
- import h2o
- from h2o.estimators import H2ODeepLearningEstimator
- from sklearn.impute import SimpleImputer
- import torch
- import torch.nn as nn
- import torch.optim as optim
- from torch.utils.data import DataLoader, TensorDataset
- import pickle
- from sklearn import metrics
- from copy import deepcopy
- import pyreadr
- import requests
- from time import time
- from math import ceil
- from statsmodels.stats.weightstats import ttest_ind
- import torch.optim.lr_scheduler as lr_scheduler
- from sklearn.model_selection import KFold



DATA_FOLDER = 'data'
RES_DATA_FOLDER = os.path.join(DATA_FOLDER, 'res')
TEST_DATA_FOLDER = os.path.join(DATA_FOLDER, 'final_test_data')
TEST_TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_test_data')
SIM_DATA_FOLDER = os.path.join(DATA_FOLDER, 'similarity_data')
RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'raw_data')
- RAW_BOTH_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_GDSC_Data')
+ RAW_BOTH_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_GDSC_data')
DRUG_DATA_FOLDER = os.path.join(DATA_FOLDER, 'drug_data')

NEW_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'new_raw_data')
GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data')
- CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_raw')
+ CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_data')
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv')
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv')
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv')


CTRP_FOLDER = os.path.join(DATA_FOLDER, 'CTRP')
GDSC_FOLDER = os.path.join(DATA_FOLDER, 'GDSC')


MODEL_FOLDER = os.path.join(DATA_FOLDER, 'model')


CTRP_EXPERIMENT_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_experiment.txt')
CTRP_COMPOUND_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_compound.txt')
CTRP_CELLLINE_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_cell_line.txt')
CTRP_AUC_FILE = os.path.join(CTRP_FOLDER, 'v20.data.curves_post_qc.txt')

GDSC_AUC_FILE = os.path.join(GDSC_FOLDER, 'GDSC2_fitted_dose_response.csv')
GDSC_cnv_data_FILE = os.path.join(GDSC_FOLDER, 'cnv_abs_copy_number_picnic_20191101.csv')
GDSC_methy_data_FILE = os.path.join(GDSC_FOLDER, 'F2_METH_CELL_DATA.txt')
GDSC_methy_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'methSampleId_2_cosmicIds.xlsx')
GDSC_exp_data_FILE = os.path.join(GDSC_FOLDER, 'Cell_line_RMA_proc_basalExp.txt')
GDSC_exp_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'E-MTAB-3610.sdrf.txt')
GDSC_mut_data_FILE = os.path.join(GDSC_FOLDER, 'mutations_all_20230202.csv')
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv')
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv')
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv')

CCLE_mut_data_FILE = os.path.join(CCLE_FOLDER, 'CCLE_mutations.csv')

TABLE_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_table.tsv')
MATRIX_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_matrix.tsv')

MODEL_FILE = os.path.join(MODEL_FOLDER, 'trained_model_V1_EMDP.sav')
TEST_FILE = os.path.join(TEST_DATA_FOLDER, 'test.gzip')
RESULT_FILE = os.path.join(RES_DATA_FOLDER, 'result.tsv')

- TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_test_data')
+ TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_data')
TCGA_SCREENING_DATA = os.path.join(TCGA_DATA_FOLDER, 'TCGA_screening_matrix.tsv')


BUILD_SIM_MATRICES = True  # Make this variable True to build similarity matrices from raw data
RANDOM_SEED = 42  # Must be used wherever can be used




def data_modalities_abbreviation():
abb = []
if 'cell_CN' in DATA_MODALITIES:
abb.append('C')
if 'cell_exp' in DATA_MODALITIES:
abb.append('E')
if 'cell_mut' in DATA_MODALITIES:
abb.append('M')
if 'cell_methy' in DATA_MODALITIES:
abb.append('T')
if 'drug_DT' in DATA_MODALITIES:
abb.append('D')
if 'drug_comp' in DATA_MODALITIES:
abb.append('P')
return ''.join(abb)


""" TRAIN_INTEGRATION_METHOD used for each cell's and drug_data's data definitions:
SIMILARITY: A kernel based integration method in which based on the similarity of each cell's data with the training cell's
data the input features for the multi layer perceptron (MLP) is constructed. The similarity function used could be different for
each data modality (euclidean, jaccard,l1_norm, or ...)

AUTO_ENCODER_V1: In this version of integrating multi-omics, for each data modality an autoencoder is trained to reduce the
dimension of the features and finally a concatenation of each autoencoder's latent space builds up the input layer of the MLP.

AUTO_ENCODER_V2: In this version of integrating multi-omics data, we train a big autoencoder which reduces the dimension of
all the different data modalities features at the same time to a smaller feature space. This version of integrating could
take a lot of memory and time to integrate the data and might be computationally expensive.


AUTO_ENCODER_V3: IN this version of integrating multi-omics data, we train an autoencoder for all the modalities kinda same as
the autoencoder version 2 but with this difference that the encoder and decoder layers are separate from each other and
just the latent layer is shared among different data modalities.
"""
