You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

main.py 7.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. from imblearn.under_sampling import RandomUnderSampler
  2. from sklearn.model_selection import train_test_split
  3. from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler
  4. from DeepDRA import DeepDRA, train, test
  5. from data_loader import RawDataLoader
  6. from evaluation import Evaluation
  7. from utils import *
  8. from mlp import MLP
  9. import random
  10. import torch
  11. import numpy as np
  12. def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test, cell_sizes, drug_sizes):
  13. """
  14. Train and evaluate the DeepDRA model.
  15. Parameters:
  16. - X_cell_train (pd.DataFrame): Training data for the cell modality.
  17. - X_cell_test (pd.DataFrame): Test data for the cell modality.
  18. - X_drug_train (pd.DataFrame): Training data for the drug modality.
  19. - X_drug_test (pd.DataFrame): Test data for the drug modality.
  20. - y_train (pd.Series): Training labels.
  21. - y_test (pd.Series): Test labels.
  22. - cell_sizes (list): Sizes of the cell modality features.
  23. - drug_sizes (list): Sizes of the drug modality features.
  24. Returns:
  25. - result: Evaluation result on the test set.
  26. """
  27. # Step 1: Define the batch size for training
  28. batch_size = 64
  29. # Step 2: Instantiate the combined model
  30. ae_latent_dim = 50
  31. mlp_input_dim = 2 * ae_latent_dim
  32. mlp_output_dim = 1
  33. num_epochs = 20
  34. model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim, mlp_input_dim, mlp_output_dim)
  35. # Step 3: Convert your training data to PyTorch tensors
  36. x_cell_train_tensor = torch.Tensor(x_cell_train.values)
  37. x_drug_train_tensor = torch.Tensor(x_drug_train.values)
  38. x_cell_train_tensor = torch.nn.functional.normalize(x_cell_train_tensor, dim=0)
  39. x_drug_train_tensor = torch.nn.functional.normalize(x_drug_train_tensor, dim=0)
  40. y_train_tensor = torch.Tensor(y_train)
  41. y_train_tensor = y_train_tensor.unsqueeze(1)
  42. # Step 4: Create a TensorDataset with the input features and target labels
  43. train_dataset = TensorDataset(x_cell_train_tensor, x_drug_train_tensor, y_train_tensor)
  44. # Step 5: Create the train_loader
  45. train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  46. # Step 6: Train the model
  47. train(model, train_loader, num_epochs=num_epochs)
  48. # Step 7: Save the trained model
  49. torch.save(model, 'DeepDRA.pth')
  50. # Step 8: Load the saved model
  51. model = torch.load('DeepDRA.pth')
  52. # Step 9: Convert your test data to PyTorch tensors
  53. x_cell_test_tensor = torch.Tensor(x_cell_test.values)
  54. x_drug_test_tensor = torch.Tensor(x_drug_test.values)
  55. y_test_tensor = torch.Tensor(y_test)
  56. # normalize data
  57. x_cell_test_tensor = torch.nn.functional.normalize(x_cell_test_tensor, dim=0)
  58. x_drug_test_tensor = torch.nn.functional.normalize(x_drug_test_tensor, dim=0)
  59. # Step 10: Create a TensorDataset with the input features and target labels for testing
  60. test_dataset = TensorDataset(x_cell_test_tensor, x_drug_test_tensor, y_test_tensor)
  61. test_loader = DataLoader(test_dataset, batch_size=len(x_cell_test))
  62. # Step 11: Test the model
  63. return test(model, test_loader)
  64. def run(k, is_test=False):
  65. """
  66. Run the training and evaluation process k times.
  67. Parameters:
  68. - k (int): Number of times to run the process.
  69. - is_test (bool): If True, run on test data; otherwise, perform train-validation split.
  70. Returns:
  71. - history (dict): Dictionary containing evaluation metrics for each run.
  72. """
  73. # Step 1: Initialize a dictionary to store evaluation metrics
  74. history = {'AUC': [], 'AUPRC': [], "Accuracy": [], "Precision": [], "Recall": [], "F1 score": []}
  75. # Step 2: Load training data
  76. train_data, train_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES,
  77. raw_file_directory=GDSC_RAW_DATA_FOLDER,
  78. screen_file_directory=GDSC_SCREENING_DATA_FOLDER,
  79. sep="\t")
  80. # Step 3: Load test data if applicable
  81. if is_test:
  82. test_data, test_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES,
  83. raw_file_directory=CCLE_RAW_DATA_FOLDER,
  84. screen_file_directory=CCLE_SCREENING_DATA_FOLDER,
  85. sep="\t")
  86. train_data, test_data = RawDataLoader.data_features_intersect(train_data, test_data)
  87. X_cell_test, X_drug_test, y_test, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(test_data,
  88. test_drug_screen)
  89. # Step 4: Prepare input data for training
  90. X_cell_train, X_drug_train, y_train, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(train_data,
  91. train_drug_screen)
  92. # Step 5: Loop over k runs
  93. for i in range(k):
  94. print('Run {}'.format(i))
  95. # Step 6: If is_test is True, perform random under-sampling on the training data
  96. if is_test:
  97. rus = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED)
  98. dataset = pd.concat([X_cell_train, X_drug_train], axis=1)
  99. dataset.index = X_cell_train.index
  100. dataset, y_train = rus.fit_resample(dataset, y_train)
  101. X_cell_train = dataset.iloc[:, :sum(cell_sizes)]
  102. X_drug_train = dataset.iloc[:, sum(cell_sizes):]
  103. # Step 7: Train and evaluate the DeepDRA model on test data
  104. results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes,
  105. drug_sizes)
  106. else:
  107. # Step 8: Split the data into training and validation sets
  108. X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test = train_test_split(X_cell_train,
  109. X_drug_train, y_train,
  110. test_size=0.2,
  111. random_state=44,
  112. shuffle=True)
  113. # Step 9: Train and evaluate the DeepDRA model on the split data
  114. results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes,
  115. drug_sizes)
  116. # Step 10: Add results to the history dictionary
  117. Evaluation.add_results(history, results)
  118. # Step 11: Display final results
  119. Evaluation.show_final_results(history)
  120. return history
  121. if __name__ == '__main__':
  122. torch.manual_seed(RANDOM_SEED)
  123. random.seed(RANDOM_SEED)
  124. np.random.seed(RANDOM_SEED)
  125. run(10, is_test=True)