You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

utils.py 5.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. import os
  2. import pandas as pd
  3. import numpy as np
  4. from tqdm import tqdm
  5. import sklearn as sk
  6. from matplotlib import pyplot as plt
  7. from scipy.spatial.distance import pdist, squareform
  8. import h2o
  9. from h2o.estimators import H2ODeepLearningEstimator
  10. from sklearn.impute import SimpleImputer
  11. import torch
  12. import torch.nn as nn
  13. import torch.optim as optim
  14. from torch.utils.data import DataLoader, TensorDataset
  15. import pickle
  16. from sklearn import metrics
  17. from copy import deepcopy
  18. import pyreadr
  19. import requests
  20. from time import time
  21. from math import ceil
  22. from statsmodels.stats.weightstats import ttest_ind
  23. import torch.optim.lr_scheduler as lr_scheduler
  24. from sklearn.model_selection import KFold
  # ---------------------------------------------------------------------------
  # Filesystem layout. Every path below is built relative to DATA_FOLDER.
  # ---------------------------------------------------------------------------
  DATA_FOLDER = 'data'

  # Sub-folders of DATA_FOLDER.
  RES_DATA_FOLDER = os.path.join(DATA_FOLDER, 'res')
  TEST_DATA_FOLDER = os.path.join(DATA_FOLDER, 'final_test_data')
  TEST_TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_test_data')
  SIM_DATA_FOLDER = os.path.join(DATA_FOLDER, 'similarity_data')
  RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'raw_data')
  RAW_BOTH_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_GDSC_Data')
  DRUG_DATA_FOLDER = os.path.join(DATA_FOLDER, 'drug_data')
  NEW_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'new_raw_data')
  GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data')
  CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_raw')
  CTRP_FOLDER = os.path.join(DATA_FOLDER, 'CTRP')
  GDSC_FOLDER = os.path.join(DATA_FOLDER, 'GDSC')
  CCLE_FOLDER = os.path.join(DATA_FOLDER, 'CCLE')
  MODEL_FOLDER = os.path.join(DATA_FOLDER, 'model')

  # Files inside CTRP_FOLDER.
  CTRP_EXPERIMENT_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_experiment.txt')
  CTRP_COMPOUND_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_compound.txt')
  CTRP_CELLLINE_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_cell_line.txt')
  CTRP_AUC_FILE = os.path.join(CTRP_FOLDER, 'v20.data.curves_post_qc.txt')

  # Files inside GDSC_FOLDER.
  GDSC_AUC_FILE = os.path.join(GDSC_FOLDER, 'GDSC2_fitted_dose_response.csv')
  GDSC_cnv_data_FILE = os.path.join(GDSC_FOLDER, 'cnv_abs_copy_number_picnic_20191101.csv')
  GDSC_methy_data_FILE = os.path.join(GDSC_FOLDER, 'F2_METH_CELL_DATA.txt')
  GDSC_methy_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'methSampleId_2_cosmicIds.xlsx')
  GDSC_exp_data_FILE = os.path.join(GDSC_FOLDER, 'Cell_line_RMA_proc_basalExp.txt')
  GDSC_exp_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'E-MTAB-3610.sdrf.txt')
  GDSC_mut_data_FILE = os.path.join(GDSC_FOLDER, 'mutations_all_20230202.csv')

  # NOTE: despite the *_FOLDER suffix, these three names hold paths to .tsv
  # files. The names are kept unchanged so existing callers keep working.
  GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv')
  CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv')
  BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv')

  # File inside CCLE_FOLDER.
  CCLE_mut_data_FILE = os.path.join(CCLE_FOLDER, 'CCLE_mutations.csv')

  # Files placed directly in DATA_FOLDER, MODEL_FOLDER, TEST_DATA_FOLDER and
  # RES_DATA_FOLDER.
  TABLE_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_table.tsv')
  MATRIX_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_matrix.tsv')
  MODEL_FILE = os.path.join(MODEL_FOLDER, 'trained_model_V1_EMDP.sav')
  TEST_FILE = os.path.join(TEST_DATA_FOLDER, 'test.gzip')
  RESULT_FILE = os.path.join(RES_DATA_FOLDER, 'result.tsv')

  # Same directory as TEST_TCGA_DATA_FOLDER; aliased so the two names cannot
  # drift apart.
  TCGA_DATA_FOLDER = TEST_TCGA_DATA_FOLDER
  TCGA_SCREENING_DATA = os.path.join(TCGA_DATA_FOLDER, 'TCGA_screening_matrix.tsv')
  # ---------------------------------------------------------------------------
  # Run configuration.
  # ---------------------------------------------------------------------------
  BUILD_SIM_MATRICES = True  # Make this variable True to build similarity matrices from raw data

  # Per-modality similarity settings: modality name -> (metric name, number).
  # NOTE(review): the metric names look like scipy.spatial.distance.pdist
  # metrics and the number is presumably a kernel scale parameter; confirm
  # against the code that consumes SIM_KERNEL.
  SIM_KERNEL = {
      'cell_CN': ('euclidean', 0.001),
      'cell_exp': ('euclidean', 0.01),
      'cell_methy': ('euclidean', 0.1),
      'cell_mut': ('jaccard', 1),
      'drug_DT': ('jaccard', 1),
      'drug_comp': ('euclidean', 0.001),
      'drug_desc': ('euclidean', 0.001),
      'drug_finger': ('euclidean', 0.001),
  }

  SAVE_MODEL = False  # Change it to True to save the trained model
  VARIATIONAL_AUTOENCODERS = False

  # Change this list to only consider specific data modalities, e.g.:
  # DATA_MODALITIES = ['cell_CN', 'cell_exp', 'cell_methy', 'cell_mut', 'drug_comp', 'drug_DT']
  DATA_MODALITIES = ['cell_mut', 'drug_desc', 'drug_finger']

  RANDOM_SEED = 42  # Must be used wherever can be used
  def data_modalities_abbreviation(modalities=None):
      """Return a short letter code for the selected data modalities.

      Each recognised modality contributes one letter. Letters always appear
      in the fixed order C, E, M, T, D, P, regardless of the order of
      ``modalities``. Names without a letter are ignored.

      Args:
          modalities: Container of modality names supporting ``in``. When
              ``None`` (the default), the module-level ``DATA_MODALITIES``
              is used, which matches the original zero-argument behaviour.

      Returns:
          The concatenated letters as a string; empty when nothing matches.
      """
      if modalities is None:
          modalities = DATA_MODALITIES

      # Ordered (modality name, letter) pairs; the order fixes the output.
      # NOTE(review): 'drug_desc' and 'drug_finger' appear in SIM_KERNEL and in
      # the default DATA_MODALITIES but have no letter here - confirm whether
      # that is intended before adding any.
      letter_by_modality = (
          ('cell_CN', 'C'),
          ('cell_exp', 'E'),
          ('cell_mut', 'M'),
          ('cell_methy', 'T'),
          ('drug_DT', 'D'),
          ('drug_comp', 'P'),
      )
      return ''.join(
          letter for name, letter in letter_by_modality if name in modalities
      )
  """ TRAIN_INTEGRATION_METHOD used for each cell's and drug's data. Definitions:
  SIMILARITY: A kernel-based integration method in which the input features for the multi-layer perceptron (MLP) are
  constructed from the similarity of each cell's data to the training cells' data. The similarity function used can be
  different for each data modality (euclidean, jaccard, l1_norm, ...).
  AUTO_ENCODER_V1: In this version of integrating multi-omics data, an autoencoder is trained for each data modality to
  reduce the dimension of its features, and the concatenation of the autoencoders' latent spaces builds up the input
  layer of the MLP.
  AUTO_ENCODER_V2: In this version of integrating multi-omics data, we train one big autoencoder which reduces the
  dimension of all the different data modalities' features at the same time to a smaller feature space. This version
  can take a lot of memory and time to integrate the data and might be computationally expensive.
  AUTO_ENCODER_V3: In this version of integrating multi-omics data, we train an autoencoder for all the modalities,
  similar to autoencoder version 2, with the difference that the encoder and decoder layers are kept separate from
  each other and only the latent layer is shared among the different data modalities.
  """