# Repository: taha.mohammadzadeh / DeepDRA

							import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import sklearn as sk
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist, squareform
import h2o
from h2o.estimators import H2ODeepLearningEstimator
from sklearn.impute import SimpleImputer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pickle
from sklearn import metrics
from copy import deepcopy
import pyreadr
import requests
from time import time
from math import ceil
from statsmodels.stats.weightstats import ttest_ind
import torch.optim.lr_scheduler as lr_scheduler
from sklearn.model_selection import KFold

# Root directory; every other path in this section is located beneath it.
DATA_FOLDER = 'data'


def _data_path(*parts):
    """Return the path obtained by joining ``parts`` onto DATA_FOLDER."""
    return os.path.join(DATA_FOLDER, *parts)


# --- Directories directly under the data root --------------------------------
RES_DATA_FOLDER = _data_path('res')
TEST_DATA_FOLDER = _data_path('final_test_data')
TEST_TCGA_DATA_FOLDER = _data_path('TCGA_test_data')
# Second name for the same directory as TEST_TCGA_DATA_FOLDER.
TCGA_DATA_FOLDER = TEST_TCGA_DATA_FOLDER
SIM_DATA_FOLDER = _data_path('similarity_data')
RAW_DATA_FOLDER = _data_path('raw_data')
RAW_BOTH_DATA_FOLDER = _data_path('CTRP_GDSC_Data')
DRUG_DATA_FOLDER = _data_path('drug_data')
MODEL_FOLDER = _data_path('model')

NEW_RAW_DATA_FOLDER = _data_path('new_raw_data')
GDSC_RAW_DATA_FOLDER = _data_path('GDSC_data')
CCLE_RAW_DATA_FOLDER = _data_path('CCLE_raw')

CTRP_FOLDER = _data_path('CTRP')
GDSC_FOLDER = _data_path('GDSC')
CCLE_FOLDER = _data_path('CCLE')

# --- Files inside CTRP_FOLDER -------------------------------------------------
CTRP_EXPERIMENT_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_experiment.txt')
CTRP_COMPOUND_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_compound.txt')
CTRP_CELLLINE_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_cell_line.txt')
CTRP_AUC_FILE = os.path.join(CTRP_FOLDER, 'v20.data.curves_post_qc.txt')

# --- Files inside GDSC_FOLDER -------------------------------------------------
GDSC_AUC_FILE = os.path.join(GDSC_FOLDER, 'GDSC2_fitted_dose_response.csv')
GDSC_cnv_data_FILE = os.path.join(GDSC_FOLDER, 'cnv_abs_copy_number_picnic_20191101.csv')
GDSC_methy_data_FILE = os.path.join(GDSC_FOLDER, 'F2_METH_CELL_DATA.txt')
GDSC_methy_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'methSampleId_2_cosmicIds.xlsx')
GDSC_exp_data_FILE = os.path.join(GDSC_FOLDER, 'Cell_line_RMA_proc_basalExp.txt')
GDSC_exp_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'E-MTAB-3610.sdrf.txt')
GDSC_mut_data_FILE = os.path.join(GDSC_FOLDER, 'mutations_all_20230202.csv')

# --- Files inside CCLE_FOLDER -------------------------------------------------
CCLE_mut_data_FILE = os.path.join(CCLE_FOLDER, 'CCLE_mutations.csv')

# --- Drug screening matrices --------------------------------------------------
# NOTE: despite the _FOLDER suffix, these three names are paths to .tsv files.
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv')
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv')
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv')
TCGA_SCREENING_DATA = os.path.join(TCGA_DATA_FOLDER, 'TCGA_screening_matrix.tsv')

# --- Files directly under the data root ---------------------------------------
TABLE_RESULTS_FILE = _data_path('drug_screening_table.tsv')
MATRIX_RESULTS_FILE = _data_path('drug_screening_matrix.tsv')

# --- Model, test-set and result files -----------------------------------------
MODEL_FILE = os.path.join(MODEL_FOLDER, 'trained_model_V1_EMDP.sav')
TEST_FILE = os.path.join(TEST_DATA_FOLDER, 'test.gzip')
RESULT_FILE = os.path.join(RES_DATA_FOLDER, 'result.tsv')

# Set to True to build the similarity matrices from the raw data.
BUILD_SIM_MATRICES = True

# Kernel settings per data modality, stored as (metric name, numeric parameter).
# NOTE(review): the numeric parameter is presumably a kernel scale/gamma --
# confirm against the code that consumes SIM_KERNEL.
SIM_KERNEL = {
    'cell_CN': ('euclidean', 0.001),
    'cell_exp': ('euclidean', 0.01),
    'cell_methy': ('euclidean', 0.1),
    'cell_mut': ('jaccard', 1),
    'drug_DT': ('jaccard', 1),
    'drug_comp': ('euclidean', 0.001),
    'drug_desc': ('euclidean', 0.001),
    'drug_finger': ('euclidean', 0.001),
}

# Set to True to save the trained model.
SAVE_MODEL = False

VARIATIONAL_AUTOENCODERS = False

# Data modalities to consider. Edit this list to restrict a run to specific
# modalities, for example:
#   ['cell_CN', 'cell_exp', 'cell_methy', 'cell_mut', 'drug_comp', 'drug_DT']
DATA_MODALITIES = [
    'cell_mut',
    'drug_desc',
    'drug_finger',
]

# Seed to pass wherever a random seed can be supplied.
RANDOM_SEED = 42


def data_modalities_abbreviation():
    """Build a short letter code for the modalities listed in DATA_MODALITIES.

    Each recognised modality contributes one letter, always in the fixed order
    C (cell_CN), E (cell_exp), M (cell_mut), T (cell_methy), D (drug_DT),
    P (drug_comp), regardless of the order inside DATA_MODALITIES. Modalities
    without an entry in the table below contribute nothing.

    Returns:
        The concatenated letters, or an empty string when no recognised
        modality is enabled.
    """
    # NOTE(review): 'drug_desc' and 'drug_finger' have no letter, so modality
    # sets differing only in those two yield the same code -- confirm intended.
    letter_for_modality = (
        ('cell_CN', 'C'),
        ('cell_exp', 'E'),
        ('cell_mut', 'M'),
        ('cell_methy', 'T'),
        ('drug_DT', 'D'),
        ('drug_comp', 'P'),
    )
    return ''.join(
        letter
        for modality, letter in letter_for_modality
        if modality in DATA_MODALITIES
    )


""" TRAIN_INTEGRATION_METHOD used for each cell's and drug_data's data definitions: 
SIMILARITY: A kernel-based integration method in which the input features for the multi-layer perceptron (MLP) are
constructed from the similarity of each cell's data to the training cells' data. The similarity function can be different
for each data modality (euclidean, jaccard, l1_norm, ...).

AUTO_ENCODER_V1: In this version of integrating multi-omics data, an autoencoder is trained for each data modality to
reduce the dimension of its features, and the concatenation of the autoencoders' latent spaces forms the input layer of the MLP.

AUTO_ENCODER_V2: In this version of integrating multi-omics data, we train one large autoencoder that reduces the
features of all the different data modalities to a smaller feature space at the same time. This version can take a lot
of memory and time to integrate the data and may be computationally expensive.

AUTO_ENCODER_V3: In this version of integrating multi-omics data, we train an autoencoder for all the modalities, similar
to autoencoder version 2, except that the encoder and decoder layers are kept separate and only the latent layer is
shared among the different data modalities.
"""