You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

main.py 8.0KB

11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. from imblearn.under_sampling import RandomUnderSampler
  2. from sklearn.model_selection import train_test_split
  3. from sklearn.utils.class_weight import compute_class_weight
  4. from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler
  5. from DeepDRA import DeepDRA, train, test
  6. from data_loader import RawDataLoader
  7. from evaluation import Evaluation
  8. from utils import *
  9. import random
  10. import torch
  11. import numpy as np
  12. import pandas as pd
  13. def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test, cell_sizes, drug_sizes,device):
  14. """
  15. Train and evaluate the DeepDRA model.
  16. Parameters:
  17. - X_cell_train (pd.DataFrame): Training data for the cell modality.
  18. - X_cell_test (pd.DataFrame): Test data for the cell modality.
  19. - X_drug_train (pd.DataFrame): Training data for the drug modality.
  20. - X_drug_test (pd.DataFrame): Test data for the drug modality.
  21. - y_train (pd.Series): Training labels.
  22. - y_test (pd.Series): Test labels.
  23. - cell_sizes (list): Sizes of the cell modality features.
  24. - drug_sizes (list): Sizes of the drug modality features.
  25. Returns:
  26. - result: Evaluation result on the test set.
  27. """
  28. # Step 1: Define the batch size for training
  29. batch_size = 64
  30. # Step 2: Instantiate the combined model
  31. ae_latent_dim = 50
  32. mlp_input_dim = 2 * ae_latent_dim
  33. mlp_output_dim = 1
  34. num_epochs = 25
  35. model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim, mlp_input_dim, mlp_output_dim)
  36. model.to(device)
  37. # Step 3: Convert your training data to PyTorch tensors
  38. x_cell_train_tensor = torch.Tensor(x_cell_train.values)
  39. x_drug_train_tensor = torch.Tensor(x_drug_train.values)
  40. x_cell_train_tensor = torch.nn.functional.normalize(x_cell_train_tensor, dim=0)
  41. x_drug_train_tensor = torch.nn.functional.normalize(x_drug_train_tensor, dim=0)
  42. y_train_tensor = torch.Tensor(y_train)
  43. y_train_tensor = y_train_tensor.unsqueeze(1)
  44. x_cell_train_tensor.to(device)
  45. x_drug_train_tensor.to(device)
  46. y_train_tensor.to(device)
  47. # Compute class weights
  48. classes = [0, 1] # Assuming binary classification
  49. class_weights = torch.tensor(compute_class_weight(class_weight='balanced', classes=classes, y=y_train),
  50. dtype=torch.float32)
  51. x_cell_train_tensor, x_cell_val_tensor, x_drug_train_tensor, x_drug_val_tensor, y_train_tensor, y_val_tensor = train_test_split(
  52. x_cell_train_tensor, x_drug_train_tensor, y_train_tensor, test_size=0.1,
  53. random_state=RANDOM_SEED,
  54. shuffle=True)
  55. # Step 4: Create a TensorDataset with the input features and target labels
  56. train_dataset = TensorDataset(x_cell_train_tensor, x_drug_train_tensor, y_train_tensor)
  57. val_dataset = TensorDataset(x_cell_val_tensor, x_drug_val_tensor, y_val_tensor)
  58. # Step 5: Create the train_loader
  59. train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  60. val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
  61. # Step 6: Train the model
  62. train(model, train_loader, val_loader, num_epochs,class_weights)
  63. # Step 7: Save the trained model
  64. torch.save(model, 'DeepDRA.pth')
  65. # Step 8: Load the saved model
  66. model = torch.load('DeepDRA.pth')
  67. # Step 9: Convert your test data to PyTorch tensors
  68. x_cell_test_tensor = torch.Tensor(x_cell_test.values)
  69. x_drug_test_tensor = torch.Tensor(x_drug_test.values)
  70. y_test_tensor = torch.Tensor(y_test)
  71. # normalize data
  72. x_cell_test_tensor = torch.nn.functional.normalize(x_cell_test_tensor, dim=0)
  73. x_drug_test_tensor = torch.nn.functional.normalize(x_drug_test_tensor, dim=0)
  74. # Step 10: Create a TensorDataset with the input features and target labels for testing
  75. test_dataset = TensorDataset(x_cell_test_tensor, x_drug_test_tensor, y_test_tensor)
  76. test_loader = DataLoader(test_dataset, batch_size=len(x_cell_test))
  77. # Step 11: Test the model
  78. return test(model, test_loader)
  79. def run(k, is_test=False):
  80. """
  81. Run the training and evaluation process k times.
  82. Parameters:
  83. - k (int): Number of times to run the process.
  84. - is_test (bool): If True, run on test data; otherwise, perform train-validation split.
  85. Returns:
  86. - history (dict): Dictionary containing evaluation metrics for each run.
  87. """
  88. device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  89. # Step 1: Initialize a dictionary to store evaluation metrics
  90. history = {'AUC': [], 'AUPRC': [], "Accuracy": [], "Precision": [], "Recall": [], "F1 score": []}
  91. # Step 2: Load training data
  92. train_data, train_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES,
  93. raw_file_directory=RAW_BOTH_DATA_FOLDER,
  94. screen_file_directory=BOTH_SCREENING_DATA_FOLDER,
  95. sep="\t")
  96. # Step 3: Load test data if applicable
  97. if is_test:
  98. test_data, test_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES,
  99. raw_file_directory=CCLE_RAW_DATA_FOLDER,
  100. screen_file_directory=CCLE_SCREENING_DATA_FOLDER,
  101. sep="\t")
  102. train_data, test_data = RawDataLoader.data_features_intersect(train_data, test_data)
  103. X_cell_test, X_drug_test, y_test, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(test_data,
  104. test_drug_screen)
  105. # Step 4: Prepare input data for training
  106. X_cell_train, X_drug_train, y_train, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(train_data,
  107. train_drug_screen)
  108. rus = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED)
  109. dataset = pd.concat([X_cell_train, X_drug_train], axis=1)
  110. dataset.index = X_cell_train.index
  111. dataset, y_train = rus.fit_resample(dataset, y_train)
  112. X_cell_train = dataset.iloc[:, :sum(cell_sizes)]
  113. X_drug_train = dataset.iloc[:, sum(cell_sizes):]
  114. # Step 5: Loop over k runs
  115. for i in range(k):
  116. print('Run {}'.format(i))
  117. # Step 6: If is_test is True, perform random under-sampling on the training data
  118. if is_test:
  119. # Step 7: Train and evaluate the DeepDRA model on test data
  120. results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes,
  121. drug_sizes, device)
  122. else:
  123. # Step 8: Split the data into training and validation sets
  124. X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test = train_test_split(X_cell_train,
  125. X_drug_train, y_train,
  126. test_size=0.2,
  127. random_state=RANDOM_SEED,
  128. shuffle=True)
  129. # Step 9: Train and evaluate the DeepDRA model on the split data
  130. results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes,
  131. drug_sizes, device)
  132. # Step 10: Add results to the history dictionary
  133. Evaluation.add_results(history, results)
  134. # Step 11: Display final results
  135. Evaluation.show_final_results(history)
  136. return history
  137. if __name__ == '__main__':
  138. torch.manual_seed(RANDOM_SEED)
  139. random.seed(RANDOM_SEED)
  140. np.random.seed(RANDOM_SEED)
  141. run(10, is_test=False)