DeepTraCDR: Prediction Cancer Drug Response using multimodal deep learning with Transformers
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

data_loader.py 7.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. import pandas as pd
  2. import numpy as np
  3. import scipy.sparse as sp
  4. from utils import *
  5. def load_data(args):
  6. """
  7. Loads dataset based on the specified data type.
  8. Args:
  9. args: Object containing configuration parameters, including the dataset type.
  10. Returns:
  11. Tuple containing adjacency matrix, drug fingerprints, expression data, null mask, positive edge count, and updated args.
  12. Raises:
  13. NotImplementedError: If the specified dataset is not supported.
  14. """
  15. if args.data == 'gdsc':
  16. return _load_gdsc(args)
  17. elif args.data == 'ccle':
  18. return _load_ccle(args)
  19. elif args.data == 'pdx':
  20. return _load_pdx(args)
  21. elif args.data == 'tcga':
  22. return _load_tcga(args)
  23. else:
  24. raise NotImplementedError(f"Dataset {args.data} is not supported.")
  25. def _load_gdsc(args):
  26. """
  27. Loads GDSC dataset, including cell-drug response, drug fingerprints, gene expression, and null mask.
  28. Args:
  29. args: Configuration object to be updated with dataset-specific parameters.
  30. Returns:
  31. Tuple of response matrix, drug fingerprints, expression data, null mask, positive edge count, and updated args.
  32. """
  33. args.alpha = 0.25
  34. args.layer_size = [512, 512]
  35. # Load drug fingerprints
  36. drug_fingerprints = [
  37. pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/ECFP6_fingerprints_GDSC.csv", index_col=0).values.astype(np.float32)
  38. ]
  39. # Load response, expression, and null mask data
  40. res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_drug_binary.csv", index_col=0).values.astype(np.float32)
  41. exprs = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_gene/merged_file_GDSC.csv", index_col=0).values.astype(np.float32)
  42. null_mask = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/null_mask.csv", index_col=0).values.astype(np.float32)
  43. pos_num = sp.coo_matrix(res).data.shape[0]
  44. return res, drug_fingerprints, exprs, null_mask, pos_num, args
  45. def _load_ccle(args):
  46. """
  47. Loads CCLE dataset, including cell-drug response, drug fingerprints, gene expression, and null mask.
  48. Args:
  49. args: Configuration object to be updated with dataset-specific parameters.
  50. Returns:
  51. Tuple of response matrix, drug fingerprints, expression data, null mask, positive edge count, and updated args.
  52. """
  53. args.alpha = 0.45
  54. args.layer_size = [512, 512]
  55. # Load drug fingerprints
  56. drug_fingerprints = [
  57. pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/CCLE/ECFP6_fingerprints.csv", index_col=0).values.astype(np.float32)
  58. ]
  59. # Load response and expression data, initialize null mask
  60. res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/CCLE/cell_drug_binary.csv", index_col=0).values.astype(np.float32)
  61. exprs = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/CCLE/merged_file.csv", index_col=0).values.astype(np.float32)
  62. null_mask = np.zeros(res.shape, dtype=np.float32)
  63. pos_num = sp.coo_matrix(res).data.shape[0]
  64. return res, drug_fingerprints, exprs, null_mask, pos_num, args
  65. def _load_pdx(args):
  66. """
  67. Loads PDX dataset by merging GDSC and PDX data, aligning gene expression by common genes.
  68. Args:
  69. args: Configuration object to be updated with dataset-specific parameters.
  70. Returns:
  71. Tuple of merged response matrix, drug fingerprints, merged expression data, merged null mask, training row count, and updated args.
  72. """
  73. args.alpha = 0.15
  74. args.layer_size = [1024, 1024]
  75. # Load response matrices
  76. gdsc_res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_drug_binary.csv", index_col=0).values.astype(np.float32)
  77. pdx_res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/PDX/pdx_response.csv", index_col=0).values.astype(np.float32)
  78. res = np.concatenate((gdsc_res, pdx_res), axis=0)
  79. train_row = gdsc_res.shape[0]
  80. # Load drug fingerprints
  81. drug_finger = [
  82. pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/ECFP6_fingerprints_GDSC.csv", index_col=0).values.astype(np.float32)
  83. ]
  84. # Load and align gene expression data
  85. gdsc_exprs_df = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_gene/merged_file_GDSC.csv", index_col=0)
  86. pdx_exprs_df = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/PDX/pdx_exprs.csv", index_col=0)
  87. common_genes = gdsc_exprs_df.columns.intersection(pdx_exprs_df.columns)
  88. gdsc_exprs_filtered = gdsc_exprs_df[common_genes].values.astype(np.float32)
  89. pdx_exprs_filtered = pdx_exprs_df[common_genes].values.astype(np.float32)
  90. exprs = np.concatenate((gdsc_exprs_filtered, pdx_exprs_filtered), axis=0)
  91. # Load and merge null masks
  92. gdsc_null_mask = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/null_mask.csv", index_col=0).values.astype(np.float32)
  93. pdx_null_mask = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/PDX/pdx_null_mask.csv", index_col=0).values.astype(np.float32)
  94. null_mask = np.concatenate((gdsc_null_mask, pdx_null_mask), axis=0)
  95. return res, drug_finger, exprs, null_mask, train_row, args
  96. def _load_tcga(args):
  97. """
  98. Loads TCGA dataset by merging GDSC and TCGA data, aligning gene expression by common genes.
  99. Args:
  100. args: Configuration object to be updated with dataset-specific parameters.
  101. Returns:
  102. Tuple of merged response matrix, drug fingerprints, merged expression data, merged null mask, training row count, and updated args.
  103. """
  104. args.alpha = 0.1
  105. args.layer_size = [1024, 1024]
  106. # Load response matrices
  107. gdsc_res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_drug_binary.csv", index_col=0).values.astype(np.float32)
  108. tcga_res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/TCGA/patient_drug_binary.csv", index_col=0).values.astype(np.float32)
  109. res = np.concatenate((gdsc_res, tcga_res), axis=0)
  110. train_row = gdsc_res.shape[0]
  111. # Load drug fingerprints
  112. drug_finger = [
  113. pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/ECFP6_fingerprints_GDSC.csv", index_col=0).values.astype(np.float32)
  114. ]
  115. # Load and align gene expression data
  116. gdsc_exprs = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_gene/merged_file_GDSC.csv", index_col=0)
  117. patient_gene = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/TCGA/tcga_gene_exprs.csv", index_col=0)
  118. common_genes = gdsc_exprs.columns.intersection(patient_gene.columns)
  119. gdsc_exprs_filtered = gdsc_exprs[common_genes].values.astype(np.float32)
  120. tcga_exprs_filtered = patient_gene[common_genes].values.astype(np.float32)
  121. exprs = np.concatenate((gdsc_exprs_filtered, tcga_exprs_filtered), axis=0)
  122. # Load and merge null masks
  123. gdsc_null_mask = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/null_mask.csv", index_col=0).values.astype(np.float32)
  124. tcga_null_mask = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/TCGA/null_mask.csv", index_col=0).values.astype(np.float32)
  125. null_mask = np.concatenate((gdsc_null_mask, tcga_null_mask), axis=0)
  126. return res, drug_finger, exprs, null_mask, train_row, args