Browse Source

feat: add main files

main
Taha Mohammadzadeh 1 year ago
parent
commit
ce5adb5616
6 changed files with 85 additions and 138 deletions
  1. 1
    5
      DeepDRA.py
  2. 7
    8
      data_loader.py
  3. 67
    32
      evaluation.py
  4. 3
    3
      main.py
  5. 0
    1
      mlp.py
  6. 7
    89
      utils.py

+ 1
- 5
DeepDRA.py View File

@@ -5,7 +5,7 @@ from torch.optim import lr_scheduler
from autoencoder import Autoencoder
from evaluation import Evaluation
from mlp import MLP
from utils import *
from utils import MODEL_FOLDER


class DeepDRA(nn.Module):
@@ -162,10 +162,6 @@ def test(model, test_loader, reverse=False):
# Apply reverse if specified
predictions = 1 - mlp_output if reverse else mlp_output

# # Store predictions and ground truth labels
# all_predictions.extend(predictions.cpu().numpy())
# all_labels.extend(labels.cpu().numpy())

# Evaluate the predictions using the specified metrics
result = Evaluation.evaluate(labels, predictions)


+ 7
- 8
data_loader.py View File

@@ -1,6 +1,11 @@
from utils import *
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.impute import SimpleImputer
import os

from utils import DRUG_DATA_FOLDER


class RawDataLoader:
@staticmethod
@@ -39,7 +44,6 @@ class RawDataLoader:
# Step 5: Return the loaded data and adjusted drug screening data
return data, drug_screen


@staticmethod
def intersect_features(data1, data2):
"""
@@ -109,7 +113,6 @@ class RawDataLoader:

return data[0]


@staticmethod
def load_raw_files(raw_file_directory, data_modalities, intersect=True):
raw_dict = {}
@@ -132,8 +135,6 @@ class RawDataLoader:
df.columns = df.columns.str.replace('_cell_CN', '')
df.columns = df.columns.str.replace('_cell_exp', '')



# Note that drug_comp raw table has some NA values so we should impute it
if any(df.isna()):
df = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(df),
@@ -168,13 +169,12 @@ class RawDataLoader:
@staticmethod
def load_screening_files(filename="AUC_matS_comb.tsv", sep=',', ):
df = pd.read_csv(filename, sep=sep, index_col=0)
# df = df.drop(['Erlotinib','17-AAG','PD-0325901','PHA-665752','PHA-665752','TAE684','Sorafenib','PLX4720','selumetinib','PD-0332991','Paclitaxel','Nilotinib','Saracatinib'],axis=1)
return df
# return pd.read_csv(os.path.join(DATA_FOLDER, "drug_screening_matrix_GDSC.tsv"), sep='\t', index_col=0)

@staticmethod
def adjust_screening_raw(drug_screen, data_dict):
raw_cell_names = []
raw_drug_names = []
for key, value in data_dict.items():
if 'cell' in key:
if len(raw_cell_names) == 0:
@@ -187,7 +187,6 @@ class RawDataLoader:
screening_cell_names = drug_screen.index
screening_drug_names = drug_screen.columns


common_cell_names = list(set(raw_cell_names).intersection(set(screening_cell_names)))
common_drug_names = list(set(raw_drug_names).intersection(set(screening_drug_names)))
for key, value in data_dict.items():

+ 67
- 32
evaluation.py View File

@@ -1,66 +1,102 @@
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, f1_score, precision_score, \
recall_score, accuracy_score
from matplotlib import pyplot as plt
from sklearn import metrics
import numpy as np
import pandas as pd
from statsmodels.stats.weightstats import ttest_ind

from utils import *


class Evaluation:
@staticmethod
def plot_train_val_accuracy(train_accuracies, val_accuracies, num_epochs):
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('h')
plt.plot(range(1, num_epochs + 1), train_accuracies)
plt.plot(range(1, num_epochs + 1), val_accuracies)
"""
Plot training and validation accuracies over epochs.

Parameters:
- train_accuracies (list): List of training accuracies.
- val_accuracies (list): List of validation accuracies.
- num_epochs (int): Number of training epochs.

Returns:
- None
"""
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracies')
plt.plot(range(1, num_epochs + 1), train_accuracies, label='Train Accuracy')
plt.plot(range(1, num_epochs + 1), val_accuracies, label='Validation Accuracy')
plt.legend()
plt.show()

@staticmethod
def plot_train_val_loss(train_loss, val_loss, num_epochs):
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('h')
plt.plot(range(1, num_epochs + 1), train_loss)
plt.plot(range(1, num_epochs + 1), val_loss)
"""
Plot training and validation losses over epochs.

Parameters:
- train_loss (list): List of training losses.
- val_loss (list): List of validation losses.
- num_epochs (int): Number of training epochs.

Returns:
- None
"""
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Losses')
plt.plot(range(1, num_epochs + 1), train_loss, label='Train Loss')
plt.plot(range(1, num_epochs + 1), val_loss, label='Validation Loss')
plt.legend()
plt.show()


@staticmethod
def evaluate(all_targets, mlp_output, show_plot=True):
"""
Evaluate model performance based on predictions and targets.

Parameters:
- all_targets (numpy.ndarray): True target labels.
- mlp_output (numpy.ndarray): Predicted probabilities.
- show_plot (bool): Whether to display ROC and PR curves.

Returns:
- results (dict): Dictionary containing evaluation metrics.
"""
# Step 1: Convert predicted probabilities to binary labels
predicted_labels = np.where(mlp_output > 0.5, 1, 0)
# Collect predictions and targets for later evaluation
predicted_labels = predicted_labels.reshape(-1)
# Convert predictions and targets to numpy arrays
all_predictions = predicted_labels

# Calculate and print AUC
# Step 2: Calculate and print AUC
fpr, tpr, thresholds = metrics.roc_curve(all_targets, mlp_output)
auc = np.round(metrics.auc(fpr, tpr), 2)

# Calculate and print AUPRC
print(all_targets)
# Step 3: Calculate and print AUPRC
precision, recall, thresholds = metrics.precision_recall_curve(all_targets, mlp_output)
auprc = np.round(metrics.auc(recall, precision), 2)
# auprc = average_precision_score(all_targets, mlp_output)

print('Accuracy: {:.2f}'.format(np.round(accuracy_score(all_targets, all_predictions), 2)))
print('AUC: {:.2f}'.format(auc))
print('AUPRC: {:.2f}'.format(auprc))

# Calculate and print confusion matrix
# Step 4: Print accuracy, AUC, AUPRC, and confusion matrix
accuracy = accuracy_score(all_targets, all_predictions)
cm = confusion_matrix(all_targets, all_predictions)
accuracy = cm.trace() / np.sum(cm)
precision = cm[0, 0] / (cm[0, 0] + cm[0, 1])
recall = cm[0, 0] / (cm[0, 0] + cm[1, 0])
f1_score = 2 * precision * recall / (precision + recall)
print('Confusion matrix:\n', cm, sep='')
print(f'Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1 score: {f1_score:.3f}')
print(f'Accuracy: {accuracy:.2f}')
print(f'AUC: {auc:.2f}')
print(f'AUPRC: {auprc:.2f}')
print(f'Confusion matrix:\n{cm}')
print(f'Precision: {precision:.3f}, Recall: {recall:.3f}, F1 score: {f1_score:.3f}')

# Step 5: Display ROC and PR curves if requested
if show_plot:
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve: AUC={auc}')
plt.plot(fpr, tpr)
plt.show()
# print(f'AUC: {auc}')

plt.xlabel('Recall')
plt.ylabel('Precision')
@@ -68,6 +104,7 @@ class Evaluation:
plt.plot(recall, precision)
plt.show()

# Violin plot for DeepDRA scores
prediction_targets = pd.DataFrame({}, columns=['Prediction', 'Target'])

res = pd.concat(
@@ -76,11 +113,8 @@ class Evaluation:

res.columns = prediction_targets.columns
prediction_targets = pd.concat([prediction_targets, res])

class_one = prediction_targets.loc[prediction_targets['Target'] == 0, 'Prediction'].astype(
np.float32).tolist()
class_minus_one = prediction_targets.loc[prediction_targets['Target'] == 1, 'Prediction'].astype(
np.float32).tolist()
class_one = prediction_targets.loc[prediction_targets['Target'] == 0, 'Prediction']
class_minus_one = prediction_targets.loc[prediction_targets['Target'] == 1, 'Prediction']

fig, ax = plt.subplots()
ax.set_ylabel("DeepDRA score")
@@ -92,12 +126,13 @@ class Evaluation:
p_value = np.format_float_scientific(ttest_ind(class_one, class_minus_one)[1])
cancer = 'all'
plt.title(
f'Responder/Non responder scores for {cancer} cancer with \np-value ~= {p_value[0]}e{p_value[-3:]} ')
f'Responder/Non-responder scores for {cancer} cancer with \np-value ~= {p_value[0]}e{p_value[-3:]} ')
bp = ax.violinplot(data_to_plot, showextrema=True, showmeans=True, showmedians=True)
bp['cmeans'].set_color('r')
bp['cmedians'].set_color('g')
plt.show()

# Step 6: Return evaluation metrics in a dictionary
return {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 score': f1_score, 'AUC': auc,
'AUPRC': auprc}


+ 3
- 3
main.py View File

@@ -7,10 +7,10 @@ from DeepDRA import DeepDRA, train, test
from data_loader import RawDataLoader
from evaluation import Evaluation
from utils import *
from mlp import MLP
import random
import torch
import numpy as np
import pandas as pd


def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test, cell_sizes, drug_sizes):
@@ -60,10 +60,10 @@ def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train,
train(model, train_loader, num_epochs=num_epochs)

# Step 7: Save the trained model
torch.save(model, 'DeepDRA.pth')
torch.save(model, MODEL_FOLDER + 'DeepDRA.pth')

# Step 8: Load the saved model
model = torch.load('DeepDRA.pth')
model = torch.load( MODEL_FOLDER + 'DeepDRA.pth')

# Step 9: Convert your test data to PyTorch tensors
x_cell_test_tensor = torch.Tensor(x_cell_test.values)

+ 0
- 1
mlp.py View File

@@ -2,7 +2,6 @@ from torch import nn
from torch import optim, no_grad
import torch
from evaluation import Evaluation
from utils import data_modalities_abbreviation
class EarlyStopper:
def __init__(self, patience=1, min_delta=0):
self.patience = patience

+ 7
- 89
utils.py View File

@@ -1,40 +1,15 @@
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import sklearn as sk
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist, squareform
import h2o
from h2o.estimators import H2ODeepLearningEstimator
from sklearn.impute import SimpleImputer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pickle
from sklearn import metrics
from copy import deepcopy
import pyreadr
import requests
from time import time
from math import ceil
from statsmodels.stats.weightstats import ttest_ind
import torch.optim.lr_scheduler as lr_scheduler
from sklearn.model_selection import KFold


DATA_FOLDER = 'data'
RES_DATA_FOLDER = os.path.join(DATA_FOLDER, 'res')
TEST_DATA_FOLDER = os.path.join(DATA_FOLDER, 'final_test_data')
TEST_TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_test_data')
SIM_DATA_FOLDER = os.path.join(DATA_FOLDER, 'similarity_data')
RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'raw_data')
RAW_BOTH_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_GDSC_Data')
RAW_BOTH_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_GDSC_data')
DRUG_DATA_FOLDER = os.path.join(DATA_FOLDER, 'drug_data')

NEW_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'new_raw_data')
GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data')
CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_raw')
CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_data')
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv')
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv')
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv')

CTRP_FOLDER = os.path.join(DATA_FOLDER, 'CTRP')
GDSC_FOLDER = os.path.join(DATA_FOLDER, 'GDSC')
@@ -42,32 +17,7 @@ CCLE_FOLDER = os.path.join(DATA_FOLDER, 'CCLE')

MODEL_FOLDER = os.path.join(DATA_FOLDER, 'model')

CTRP_EXPERIMENT_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_experiment.txt')
CTRP_COMPOUND_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_compound.txt')
CTRP_CELLLINE_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_cell_line.txt')
CTRP_AUC_FILE = os.path.join(CTRP_FOLDER, 'v20.data.curves_post_qc.txt')

GDSC_AUC_FILE = os.path.join(GDSC_FOLDER, 'GDSC2_fitted_dose_response.csv')
GDSC_cnv_data_FILE = os.path.join(GDSC_FOLDER, 'cnv_abs_copy_number_picnic_20191101.csv')
GDSC_methy_data_FILE = os.path.join(GDSC_FOLDER, 'F2_METH_CELL_DATA.txt')
GDSC_methy_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'methSampleId_2_cosmicIds.xlsx')
GDSC_exp_data_FILE = os.path.join(GDSC_FOLDER, 'Cell_line_RMA_proc_basalExp.txt')
GDSC_exp_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'E-MTAB-3610.sdrf.txt')
GDSC_mut_data_FILE = os.path.join(GDSC_FOLDER, 'mutations_all_20230202.csv')
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv')
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv')
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv')

CCLE_mut_data_FILE = os.path.join(CCLE_FOLDER, 'CCLE_mutations.csv')

TABLE_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_table.tsv')
MATRIX_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_matrix.tsv')

MODEL_FILE = os.path.join(MODEL_FOLDER, 'trained_model_V1_EMDP.sav')
TEST_FILE = os.path.join(TEST_DATA_FOLDER, 'test.gzip')
RESULT_FILE = os.path.join(RES_DATA_FOLDER, 'result.tsv')

TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_test_data')
TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_data')
TCGA_SCREENING_DATA = os.path.join(TCGA_DATA_FOLDER, 'TCGA_screening_matrix.tsv')

BUILD_SIM_MATRICES = True # Make this variable True to build similarity matrices from raw data
@@ -81,36 +31,4 @@ DATA_MODALITIES = ['cell_mut', 'drug_desc', 'drug_finger']
RANDOM_SEED = 42 # Must be used wherever can be used


def data_modalities_abbreviation():
abb = []
if 'cell_CN' in DATA_MODALITIES:
abb.append('C')
if 'cell_exp' in DATA_MODALITIES:
abb.append('E')
if 'cell_mut' in DATA_MODALITIES:
abb.append('M')
if 'cell_methy' in DATA_MODALITIES:
abb.append('T')
if 'drug_DT' in DATA_MODALITIES:
abb.append('D')
if 'drug_comp' in DATA_MODALITIES:
abb.append('P')
return ''.join(abb)


""" TRAIN_INTEGRATION_METHOD used for each cell's and drug_data's data definitions:
SIMILARITY: A kernel based integration method in which based on the similarity of each cell's data with the training cell's
data the input features for the multi layer perceptron (MLP) is constructed. The similarity function used could be different for
each data modality (euclidean, jaccard,l1_norm, or ...)

AUTO_ENCODER_V1: In this version of integrating multi-omics, for each data modality an autoencoder is trained to reduce the
dimension of the features and finally a concatenation of each autoencoder's latent space builds up the input layer of the MLP.

AUTO_ENCODER_V2: In this version of integrating multi-omics data, we train a big autoencoder which reduces the dimension of
all the different data modalities features at the same time to a smaller feature space. This version of integrating could
take a lot of memory and time to integrate the data and might be computationally expensive.

AUTO_ENCODER_V3: IN this version of integrating multi-omics data, we train an autoencoder for all the modalities kinda same as
the autoencoder version 2 but with this difference that the encoder and decoder layers are separate from each other and
just the latent layer is shared among different data modalities.
"""

Loading…
Cancel
Save