@@ -5,7 +5,7 @@ from torch.optim import lr_scheduler | |||
from autoencoder import Autoencoder | |||
from evaluation import Evaluation | |||
from mlp import MLP | |||
from utils import * | |||
from utils import MODEL_FOLDER | |||
class DeepDRA(nn.Module): | |||
@@ -162,10 +162,6 @@ def test(model, test_loader, reverse=False): | |||
# Apply reverse if specified | |||
predictions = 1 - mlp_output if reverse else mlp_output | |||
# # Store predictions and ground truth labels | |||
# all_predictions.extend(predictions.cpu().numpy()) | |||
# all_labels.extend(labels.cpu().numpy()) | |||
# Evaluate the predictions using the specified metrics | |||
result = Evaluation.evaluate(labels, predictions) | |||
@@ -1,6 +1,11 @@ | |||
from utils import * | |||
import pandas as pd | |||
import numpy as np | |||
from tqdm import tqdm | |||
from sklearn.impute import SimpleImputer | |||
import os | |||
from utils import DRUG_DATA_FOLDER | |||
class RawDataLoader: | |||
@staticmethod | |||
@@ -39,7 +44,6 @@ class RawDataLoader: | |||
# Step 5: Return the loaded data and adjusted drug screening data | |||
return data, drug_screen | |||
@staticmethod | |||
def intersect_features(data1, data2): | |||
""" | |||
@@ -109,7 +113,6 @@ class RawDataLoader: | |||
return data[0] | |||
@staticmethod | |||
def load_raw_files(raw_file_directory, data_modalities, intersect=True): | |||
raw_dict = {} | |||
@@ -132,8 +135,6 @@ class RawDataLoader: | |||
df.columns = df.columns.str.replace('_cell_CN', '') | |||
df.columns = df.columns.str.replace('_cell_exp', '') | |||
# Note that drug_comp raw table has some NA values so we should impute it | |||
if any(df.isna()): | |||
df = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(df), | |||
@@ -168,13 +169,12 @@ class RawDataLoader: | |||
@staticmethod | |||
def load_screening_files(filename="AUC_matS_comb.tsv", sep=',', ): | |||
df = pd.read_csv(filename, sep=sep, index_col=0) | |||
# df = df.drop(['Erlotinib','17-AAG','PD-0325901','PHA-665752','PHA-665752','TAE684','Sorafenib','PLX4720','selumetinib','PD-0332991','Paclitaxel','Nilotinib','Saracatinib'],axis=1) | |||
return df | |||
# return pd.read_csv(os.path.join(DATA_FOLDER, "drug_screening_matrix_GDSC.tsv"), sep='\t', index_col=0) | |||
@staticmethod | |||
def adjust_screening_raw(drug_screen, data_dict): | |||
raw_cell_names = [] | |||
raw_drug_names = [] | |||
for key, value in data_dict.items(): | |||
if 'cell' in key: | |||
if len(raw_cell_names) == 0: | |||
@@ -187,7 +187,6 @@ class RawDataLoader: | |||
screening_cell_names = drug_screen.index | |||
screening_drug_names = drug_screen.columns | |||
common_cell_names = list(set(raw_cell_names).intersection(set(screening_cell_names))) | |||
common_drug_names = list(set(raw_drug_names).intersection(set(screening_drug_names))) | |||
for key, value in data_dict.items(): |
@@ -1,66 +1,102 @@ | |||
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, f1_score, precision_score, \ | |||
recall_score, accuracy_score | |||
from matplotlib import pyplot as plt | |||
from sklearn import metrics | |||
import numpy as np | |||
import pandas as pd | |||
from statsmodels.stats.weightstats import ttest_ind | |||
from utils import * | |||
class Evaluation: | |||
@staticmethod | |||
def plot_train_val_accuracy(train_accuracies, val_accuracies, num_epochs): | |||
plt.xlabel('epoch') | |||
plt.ylabel('accuracy') | |||
plt.title('h') | |||
plt.plot(range(1, num_epochs + 1), train_accuracies) | |||
plt.plot(range(1, num_epochs + 1), val_accuracies) | |||
""" | |||
Plot training and validation accuracies over epochs. | |||
Parameters: | |||
- train_accuracies (list): List of training accuracies. | |||
- val_accuracies (list): List of validation accuracies. | |||
- num_epochs (int): Number of training epochs. | |||
Returns: | |||
- None | |||
""" | |||
plt.xlabel('Epoch') | |||
plt.ylabel('Accuracy') | |||
plt.title('Training and Validation Accuracies') | |||
plt.plot(range(1, num_epochs + 1), train_accuracies, label='Train Accuracy') | |||
plt.plot(range(1, num_epochs + 1), val_accuracies, label='Validation Accuracy') | |||
plt.legend() | |||
plt.show() | |||
@staticmethod | |||
def plot_train_val_loss(train_loss, val_loss, num_epochs): | |||
plt.xlabel('epoch') | |||
plt.ylabel('loss') | |||
plt.title('h') | |||
plt.plot(range(1, num_epochs + 1), train_loss) | |||
plt.plot(range(1, num_epochs + 1), val_loss) | |||
""" | |||
Plot training and validation losses over epochs. | |||
Parameters: | |||
- train_loss (list): List of training losses. | |||
- val_loss (list): List of validation losses. | |||
- num_epochs (int): Number of training epochs. | |||
Returns: | |||
- None | |||
""" | |||
plt.xlabel('Epoch') | |||
plt.ylabel('Loss') | |||
plt.title('Training and Validation Losses') | |||
plt.plot(range(1, num_epochs + 1), train_loss, label='Train Loss') | |||
plt.plot(range(1, num_epochs + 1), val_loss, label='Validation Loss') | |||
plt.legend() | |||
plt.show() | |||
@staticmethod | |||
def evaluate(all_targets, mlp_output, show_plot=True): | |||
""" | |||
Evaluate model performance based on predictions and targets. | |||
Parameters: | |||
- all_targets (numpy.ndarray): True target labels. | |||
- mlp_output (numpy.ndarray): Predicted probabilities. | |||
- show_plot (bool): Whether to display ROC and PR curves. | |||
Returns: | |||
- results (dict): Dictionary containing evaluation metrics. | |||
""" | |||
# Step 1: Convert predicted probabilities to binary labels | |||
predicted_labels = np.where(mlp_output > 0.5, 1, 0) | |||
# Collect predictions and targets for later evaluation | |||
predicted_labels = predicted_labels.reshape(-1) | |||
# Convert predictions and targets to numpy arrays | |||
all_predictions = predicted_labels | |||
# Calculate and print AUC | |||
# Step 2: Calculate and print AUC | |||
fpr, tpr, thresholds = metrics.roc_curve(all_targets, mlp_output) | |||
auc = np.round(metrics.auc(fpr, tpr), 2) | |||
# Calculate and print AUPRC | |||
print(all_targets) | |||
# Step 3: Calculate and print AUPRC | |||
precision, recall, thresholds = metrics.precision_recall_curve(all_targets, mlp_output) | |||
auprc = np.round(metrics.auc(recall, precision), 2) | |||
# auprc = average_precision_score(all_targets, mlp_output) | |||
print('Accuracy: {:.2f}'.format(np.round(accuracy_score(all_targets, all_predictions), 2))) | |||
print('AUC: {:.2f}'.format(auc)) | |||
print('AUPRC: {:.2f}'.format(auprc)) | |||
# Calculate and print confusion matrix | |||
# Step 4: Print accuracy, AUC, AUPRC, and confusion matrix | |||
accuracy = accuracy_score(all_targets, all_predictions) | |||
cm = confusion_matrix(all_targets, all_predictions) | |||
accuracy = cm.trace() / np.sum(cm) | |||
precision = cm[0, 0] / (cm[0, 0] + cm[0, 1]) | |||
recall = cm[0, 0] / (cm[0, 0] + cm[1, 0]) | |||
f1_score = 2 * precision * recall / (precision + recall) | |||
print('Confusion matrix:\n', cm, sep='') | |||
print(f'Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1 score: {f1_score:.3f}') | |||
print(f'Accuracy: {accuracy:.2f}') | |||
print(f'AUC: {auc:.2f}') | |||
print(f'AUPRC: {auprc:.2f}') | |||
print(f'Confusion matrix:\n{cm}') | |||
print(f'Precision: {precision:.3f}, Recall: {recall:.3f}, F1 score: {f1_score:.3f}') | |||
# Step 5: Display ROC and PR curves if requested | |||
if show_plot: | |||
plt.xlabel('False Positive Rate') | |||
plt.ylabel('True Positive Rate') | |||
plt.title(f'ROC Curve: AUC={auc}') | |||
plt.plot(fpr, tpr) | |||
plt.show() | |||
# print(f'AUC: {auc}') | |||
plt.xlabel('Recall') | |||
plt.ylabel('Precision') | |||
@@ -68,6 +104,7 @@ class Evaluation: | |||
plt.plot(recall, precision) | |||
plt.show() | |||
# Violin plot for DeepDRA scores | |||
prediction_targets = pd.DataFrame({}, columns=['Prediction', 'Target']) | |||
res = pd.concat( | |||
@@ -76,11 +113,8 @@ class Evaluation: | |||
res.columns = prediction_targets.columns | |||
prediction_targets = pd.concat([prediction_targets, res]) | |||
class_one = prediction_targets.loc[prediction_targets['Target'] == 0, 'Prediction'].astype( | |||
np.float32).tolist() | |||
class_minus_one = prediction_targets.loc[prediction_targets['Target'] == 1, 'Prediction'].astype( | |||
np.float32).tolist() | |||
class_one = prediction_targets.loc[prediction_targets['Target'] == 0, 'Prediction'] | |||
class_minus_one = prediction_targets.loc[prediction_targets['Target'] == 1, 'Prediction'] | |||
fig, ax = plt.subplots() | |||
ax.set_ylabel("DeepDRA score") | |||
@@ -92,12 +126,13 @@ class Evaluation: | |||
p_value = np.format_float_scientific(ttest_ind(class_one, class_minus_one)[1]) | |||
cancer = 'all' | |||
plt.title( | |||
f'Responder/Non responder scores for {cancer} cancer with \np-value ~= {p_value[0]}e{p_value[-3:]} ') | |||
f'Responder/Non-responder scores for {cancer} cancer with \np-value ~= {p_value[0]}e{p_value[-3:]} ') | |||
bp = ax.violinplot(data_to_plot, showextrema=True, showmeans=True, showmedians=True) | |||
bp['cmeans'].set_color('r') | |||
bp['cmedians'].set_color('g') | |||
plt.show() | |||
# Step 6: Return evaluation metrics in a dictionary | |||
return {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 score': f1_score, 'AUC': auc, | |||
'AUPRC': auprc} | |||
@@ -7,10 +7,10 @@ from DeepDRA import DeepDRA, train, test | |||
from data_loader import RawDataLoader | |||
from evaluation import Evaluation | |||
from utils import * | |||
from mlp import MLP | |||
import random | |||
import torch | |||
import numpy as np | |||
import pandas as pd | |||
def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test, cell_sizes, drug_sizes): | |||
@@ -60,10 +60,10 @@ def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, | |||
train(model, train_loader, num_epochs=num_epochs) | |||
# Step 7: Save the trained model | |||
torch.save(model, 'DeepDRA.pth') | |||
torch.save(model, MODEL_FOLDER + 'DeepDRA.pth') | |||
# Step 8: Load the saved model | |||
model = torch.load('DeepDRA.pth') | |||
model = torch.load( MODEL_FOLDER + 'DeepDRA.pth') | |||
# Step 9: Convert your test data to PyTorch tensors | |||
x_cell_test_tensor = torch.Tensor(x_cell_test.values) |
@@ -2,7 +2,6 @@ from torch import nn | |||
from torch import optim, no_grad | |||
import torch | |||
from evaluation import Evaluation | |||
from utils import data_modalities_abbreviation | |||
class EarlyStopper: | |||
def __init__(self, patience=1, min_delta=0): | |||
self.patience = patience |
@@ -1,40 +1,15 @@ | |||
import os | |||
import pandas as pd | |||
import numpy as np | |||
from tqdm import tqdm | |||
import sklearn as sk | |||
from matplotlib import pyplot as plt | |||
from scipy.spatial.distance import pdist, squareform | |||
import h2o | |||
from h2o.estimators import H2ODeepLearningEstimator | |||
from sklearn.impute import SimpleImputer | |||
import torch | |||
import torch.nn as nn | |||
import torch.optim as optim | |||
from torch.utils.data import DataLoader, TensorDataset | |||
import pickle | |||
from sklearn import metrics | |||
from copy import deepcopy | |||
import pyreadr | |||
import requests | |||
from time import time | |||
from math import ceil | |||
from statsmodels.stats.weightstats import ttest_ind | |||
import torch.optim.lr_scheduler as lr_scheduler | |||
from sklearn.model_selection import KFold | |||
DATA_FOLDER = 'data' | |||
RES_DATA_FOLDER = os.path.join(DATA_FOLDER, 'res') | |||
TEST_DATA_FOLDER = os.path.join(DATA_FOLDER, 'final_test_data') | |||
TEST_TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_test_data') | |||
SIM_DATA_FOLDER = os.path.join(DATA_FOLDER, 'similarity_data') | |||
RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'raw_data') | |||
RAW_BOTH_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_GDSC_Data') | |||
RAW_BOTH_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_GDSC_data') | |||
DRUG_DATA_FOLDER = os.path.join(DATA_FOLDER, 'drug_data') | |||
NEW_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'new_raw_data') | |||
GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data') | |||
CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_raw') | |||
CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_data') | |||
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv') | |||
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv') | |||
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv') | |||
CTRP_FOLDER = os.path.join(DATA_FOLDER, 'CTRP') | |||
GDSC_FOLDER = os.path.join(DATA_FOLDER, 'GDSC') | |||
@@ -42,32 +17,7 @@ CCLE_FOLDER = os.path.join(DATA_FOLDER, 'CCLE') | |||
MODEL_FOLDER = os.path.join(DATA_FOLDER, 'model') | |||
CTRP_EXPERIMENT_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_experiment.txt') | |||
CTRP_COMPOUND_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_compound.txt') | |||
CTRP_CELLLINE_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_cell_line.txt') | |||
CTRP_AUC_FILE = os.path.join(CTRP_FOLDER, 'v20.data.curves_post_qc.txt') | |||
GDSC_AUC_FILE = os.path.join(GDSC_FOLDER, 'GDSC2_fitted_dose_response.csv') | |||
GDSC_cnv_data_FILE = os.path.join(GDSC_FOLDER, 'cnv_abs_copy_number_picnic_20191101.csv') | |||
GDSC_methy_data_FILE = os.path.join(GDSC_FOLDER, 'F2_METH_CELL_DATA.txt') | |||
GDSC_methy_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'methSampleId_2_cosmicIds.xlsx') | |||
GDSC_exp_data_FILE = os.path.join(GDSC_FOLDER, 'Cell_line_RMA_proc_basalExp.txt') | |||
GDSC_exp_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'E-MTAB-3610.sdrf.txt') | |||
GDSC_mut_data_FILE = os.path.join(GDSC_FOLDER, 'mutations_all_20230202.csv') | |||
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv') | |||
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv') | |||
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv') | |||
CCLE_mut_data_FILE = os.path.join(CCLE_FOLDER, 'CCLE_mutations.csv') | |||
TABLE_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_table.tsv') | |||
MATRIX_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_matrix.tsv') | |||
MODEL_FILE = os.path.join(MODEL_FOLDER, 'trained_model_V1_EMDP.sav') | |||
TEST_FILE = os.path.join(TEST_DATA_FOLDER, 'test.gzip') | |||
RESULT_FILE = os.path.join(RES_DATA_FOLDER, 'result.tsv') | |||
TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_test_data') | |||
TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_data') | |||
TCGA_SCREENING_DATA = os.path.join(TCGA_DATA_FOLDER, 'TCGA_screening_matrix.tsv') | |||
BUILD_SIM_MATRICES = True # Make this variable True to build similarity matrices from raw data | |||
@@ -81,36 +31,4 @@ DATA_MODALITIES = ['cell_mut', 'drug_desc', 'drug_finger'] | |||
RANDOM_SEED = 42 # Must be used wherever can be used | |||
def data_modalities_abbreviation(): | |||
abb = [] | |||
if 'cell_CN' in DATA_MODALITIES: | |||
abb.append('C') | |||
if 'cell_exp' in DATA_MODALITIES: | |||
abb.append('E') | |||
if 'cell_mut' in DATA_MODALITIES: | |||
abb.append('M') | |||
if 'cell_methy' in DATA_MODALITIES: | |||
abb.append('T') | |||
if 'drug_DT' in DATA_MODALITIES: | |||
abb.append('D') | |||
if 'drug_comp' in DATA_MODALITIES: | |||
abb.append('P') | |||
return ''.join(abb) | |||
""" TRAIN_INTEGRATION_METHOD used for each cell's and drug_data's data definitions: | |||
SIMILARITY: A kernel based integration method in which based on the similarity of each cell's data with the training cell's | |||
data the input features for the multi layer perceptron (MLP) is constructed. The similarity function used could be different for | |||
each data modality (euclidean, jaccard,l1_norm, or ...) | |||
AUTO_ENCODER_V1: In this version of integrating multi-omics, for each data modality an autoencoder is trained to reduce the | |||
dimension of the features and finally a concatenation of each autoencoder's latent space builds up the input layer of the MLP. | |||
AUTO_ENCODER_V2: In this version of integrating multi-omics data, we train a big autoencoder which reduces the dimension of | |||
all the different data modalities features at the same time to a smaller feature space. This version of integrating could | |||
take a lot of memory and time to integrate the data and might be computationally expensive. | |||
AUTO_ENCODER_V3: IN this version of integrating multi-omics data, we train an autoencoder for all the modalities kinda same as | |||
the autoencoder version 2 but with this difference that the encoder and decoder layers are separate from each other and | |||
just the latent layer is shared among different data modalities. | |||
""" |