You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

main.py 10KB

11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. from imblearn.under_sampling import RandomUnderSampler
  2. from sklearn.model_selection import train_test_split
  3. from sklearn.utils.class_weight import compute_class_weight
  4. from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler
  5. from sklearn.model_selection import KFold
  6. from DeepDRA import DeepDRA, train, test
  7. from data_loader import RawDataLoader
  8. from evaluation import Evaluation
  9. from utils import *
  10. import random
  11. import torch
  12. import numpy as np
  13. import pandas as pd
  # Mini-batch size used by the training/validation DataLoaders in this module.
  batch_size = 64
  # Latent size handed twice to the DeepDRA constructor
  # (presumably the cell and the drug autoencoder bottlenecks -- confirm in DeepDRA.py).
  ae_latent_dim = 50
  # Number of epochs passed to train().
  num_epochs = 25
  def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test, cell_sizes, drug_sizes,
                    device):
      """
      Train the DeepDRA model on the training data and evaluate it on the test data.

      10% of the training samples are split off as a validation set that is handed
      to train(). The trained model is written to 'DeepDRA.pth', read back, and then
      evaluated with test() on the whole test set in a single batch.

      Parameters:
      - x_cell_train (pd.DataFrame): Training data for the cell modality.
      - x_cell_test (pd.DataFrame): Test data for the cell modality.
      - x_drug_train (pd.DataFrame): Training data for the drug modality.
      - x_drug_test (pd.DataFrame): Test data for the drug modality.
      - y_train (pd.Series): Training labels.
      - y_test (pd.Series): Test labels.
      - cell_sizes (list): Sizes of the cell modality features.
      - drug_sizes (list): Sizes of the drug modality features.
      - device (torch.device): Device the model and the tensors are moved to.

      Returns:
      - result: Whatever test(model, test_loader) returns for the test set.
      """
      # NOTE(review): mlp_input_dim / mlp_output_dim are not defined in this file; they
      # must come from `from utils import *`. cv_train() builds DeepDRA with only four
      # arguments -- confirm which constructor signature is current.
      model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim, mlp_input_dim, mlp_output_dim)
      model = model.to(device)

      # Convert the training frames to tensors and L2-normalize along dim 0
      # (i.e. per column of the frame).
      x_cell_train_tensor = torch.nn.functional.normalize(torch.Tensor(x_cell_train.values), dim=0)
      x_drug_train_tensor = torch.nn.functional.normalize(torch.Tensor(x_drug_train.values), dim=0)
      y_train_tensor = torch.Tensor(y_train).unsqueeze(1)

      # Balanced class weights for the binary labels. `classes` is an ndarray because
      # recent scikit-learn releases validate the argument type and reject a plain list.
      classes = np.array([0, 1])
      class_weights = torch.tensor(
          compute_class_weight(class_weight='balanced', classes=classes, y=y_train),
          dtype=torch.float32)

      # Hold out 10% of the training samples for validation.
      (x_cell_train_tensor, x_cell_val_tensor,
       x_drug_train_tensor, x_drug_val_tensor,
       y_train_tensor, y_val_tensor) = train_test_split(
          x_cell_train_tensor, x_drug_train_tensor, y_train_tensor,
          test_size=0.1,
          random_state=RANDOM_SEED,
          shuffle=True)

      # Wrap the tensors (moved to `device`) in datasets and loaders.
      train_dataset = TensorDataset(x_cell_train_tensor.to(device), x_drug_train_tensor.to(device),
                                    y_train_tensor.to(device))
      val_dataset = TensorDataset(x_cell_val_tensor.to(device), x_drug_val_tensor.to(device),
                                  y_val_tensor.to(device))
      train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
      val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

      # Train the model.
      train(model, train_loader, val_loader, num_epochs, class_weights)

      # Persist the whole module and read it back. The file is produced by this very
      # function, so full unpickling is acceptable here; weights_only=False is spelled
      # out because newer PyTorch defaults to weights_only=True, which refuses a pickled
      # nn.Module. map_location keeps the reloaded model on the device used for testing.
      torch.save(model, 'DeepDRA.pth')
      model = torch.load('DeepDRA.pth', map_location=device, weights_only=False)

      # Convert and normalize the test data. The normalization uses the test set's own
      # column norms, exactly as the training data used its own.
      x_cell_test_tensor = torch.nn.functional.normalize(torch.Tensor(x_cell_test.values), dim=0).to(device)
      x_drug_test_tensor = torch.nn.functional.normalize(torch.Tensor(x_drug_test.values), dim=0).to(device)
      y_test_tensor = torch.Tensor(y_test).to(device)

      # One batch spanning the entire test set.
      test_dataset = TensorDataset(x_cell_test_tensor, x_drug_test_tensor, y_test_tensor)
      test_loader = DataLoader(test_dataset, batch_size=len(x_cell_test))

      # Evaluate the reloaded model.
      return test(model, test_loader)
  def cv_train(x_cell_train, x_drug_train, y_train, cell_sizes,
               drug_sizes, device, k=5, ):
      """
      Run k-fold cross-validation of DeepDRA on the training data.

      For every fold a fresh DeepDRA model is trained on the fold's training indices
      and evaluated with test() on the fold's held-out indices; the per-fold results
      are accumulated with Evaluation.add_results().

      Parameters:
      - x_cell_train (pd.DataFrame): Training data for the cell modality.
      - x_drug_train (pd.DataFrame): Training data for the drug modality.
      - y_train (pd.Series): Training labels.
      - cell_sizes (list): Sizes of the cell modality features.
      - drug_sizes (list): Sizes of the drug modality features.
      - device (torch.device): Device the model and the tensors are moved to.
      - k (int): Number of folds.

      Returns:
      - Whatever Evaluation.show_final_results(history) returns for the per-fold history.
      """
      splits = KFold(n_splits=k, shuffle=True, random_state=RANDOM_SEED)
      history = {'AUC': [], 'AUPRC': [], "Accuracy": [], "Precision": [], "Recall": [], "F1 score": []}

      # The tensors, the dataset and the class weights do not depend on the fold, so
      # they are built once. They are placed on `device`, as train_DeepDRA() does.
      # NOTE(review): unlike train_DeepDRA(), no normalization is applied here -- confirm
      # that this is intended.
      x_cell_train_tensor = torch.Tensor(x_cell_train.values).to(device)
      x_drug_train_tensor = torch.Tensor(x_drug_train.values).to(device)
      y_train_tensor = torch.Tensor(y_train).unsqueeze(1).to(device)
      train_dataset = TensorDataset(x_cell_train_tensor, x_drug_train_tensor, y_train_tensor)

      # Balanced class weights over all labels. `classes` is an ndarray because recent
      # scikit-learn releases validate the argument type and reject a plain list.
      classes = np.array([0, 1])
      class_weights = torch.tensor(
          compute_class_weight(class_weight='balanced', classes=classes, y=y_train),
          dtype=torch.float32)

      for fold, (train_idx, val_idx) in enumerate(splits.split(np.arange(len(x_cell_train)))):
          print('Fold {}'.format(fold + 1))

          # Samplers restrict the shared dataset to this fold's indices.
          train_sampler = SubsetRandomSampler(train_idx)
          test_sampler = SubsetRandomSampler(val_idx)

          # A new model per fold so that no weights carry over between folds.
          model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim)
          model = model.to(device)

          train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

          # The training loader is also passed as the validation loader; the held-out
          # indices are reserved for the evaluation below.
          train(model, train_loader, train_loader, num_epochs, class_weights)

          # batch_size covers the full dataset, so the held-out subset arrives as one batch.
          test_loader = DataLoader(train_dataset, batch_size=len(x_cell_train), sampler=test_sampler)
          results = test(model, test_loader)

          # Record this fold's metrics.
          Evaluation.add_results(history, results)

      return Evaluation.show_final_results(history)
  def run(k, is_test=False):
      """
      Run the training and evaluation process k times.

      Parameters:
      - k (int): Number of times to run the process.
      - is_test (bool): If True, train on the training data and evaluate on the
        separately loaded test data; otherwise run 5-fold cross-validation on the
        training data.

      Returns:
      - history (dict): Dictionary containing evaluation metrics for each run.
      """
      device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
      print(device)

      # Step 1: Initialize a dictionary to store evaluation metrics
      history = {'AUC': [], 'AUPRC': [], "Accuracy": [], "Precision": [], "Recall": [], "F1 score": []}

      # Step 2: Load training data
      train_data, train_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES,
                                                              raw_file_directory=RAW_BOTH_DATA_FOLDER,
                                                              screen_file_directory=BOTH_SCREENING_DATA_FOLDER,
                                                              sep="\t")

      # Step 3: Load test data if applicable and restrict both sets via
      # RawDataLoader.data_features_intersect()
      if is_test:
          test_data, test_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES,
                                                                raw_file_directory=CCLE_RAW_DATA_FOLDER,
                                                                screen_file_directory=CCLE_SCREENING_DATA_FOLDER,
                                                                sep="\t")
          train_data, test_data = RawDataLoader.data_features_intersect(train_data, test_data)

      # Step 4: Prepare input data for training (and for testing, if applicable)
      x_cell_train, x_drug_train, y_train, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(
          train_data, train_drug_screen)
      if is_test:
          x_cell_test, x_drug_test, y_test, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(
              test_data, test_drug_screen)

      # Step 5: Randomly under-sample the majority class of the training data. Cell and
      # drug features are concatenated so both are resampled with the same rows, then
      # split back at the cell/drug column boundary.
      # NOTE(review): the extracted source lost its indentation; this step is placed at
      # function level so that it applies to both modes -- confirm against the repository.
      rus = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED)
      dataset = pd.concat([x_cell_train, x_drug_train], axis=1)
      dataset.index = x_cell_train.index
      dataset, y_train = rus.fit_resample(dataset, y_train)
      n_cell_features = sum(cell_sizes)
      x_cell_train = dataset.iloc[:, :n_cell_features]
      x_drug_train = dataset.iloc[:, n_cell_features:]

      # Step 6: Loop over k runs
      for i in range(k):
          print('Run {}'.format(i))

          if is_test:
              # Step 7: Train on the training data and evaluate on the test data
              results = train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test,
                                      cell_sizes, drug_sizes, device)
          else:
              # Step 8: Cross-validate on the training data
              results = cv_train(x_cell_train, x_drug_train, y_train, cell_sizes, drug_sizes, device, k=5)

          # Step 9: Add results to the history dictionary
          Evaluation.add_results(history, results)

      # Step 10: Display final results
      Evaluation.show_final_results(history)
      return history
  if __name__ == '__main__':
      # Seed torch, Python's random module and NumPy with the same seed before running.
      torch.manual_seed(RANDOM_SEED)
      random.seed(RANDOM_SEED)
      np.random.seed(RANDOM_SEED)
      # Ten train/evaluate runs against the separate test data.
      run(10, is_test=True)