    return torch.square(w).sum()


def train(model, train_loader, val_loader, num_epochs, class_weights):
    """
    Trains the DeepDRA (Deep Drug Response Anticipation) model.

    Parameters:
    - model (DeepDRA): The model to train.
    - train_loader (DataLoader): DataLoader for the training dataset.
    - val_loader (DataLoader): DataLoader for the validation dataset.
    - num_epochs (int): Number of training epochs.
    - class_weights (Tensor): Per-class weights for the imbalanced labels (used only
      by the commented-out weighted-loss variant below).
    """
    autoencoder_loss_fn = nn.MSELoss()
    mlp_loss_fn = nn.BCELoss()

    # Per-epoch histories of loss and accuracy
    train_accuracies = []
    val_accuracies = []
    train_loss = []
    val_loss = []

    mlp_optimizer = optim.Adam(model.parameters(), lr=0.0005)
    scheduler = lr_scheduler.ReduceLROnPlateau(mlp_optimizer, mode='min', factor=0.8, patience=5, verbose=True)
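    # Note: the scheduler above shrinks the learning rate by a factor of 0.8 whenever
    # the monitored loss fails to improve for 5 consecutive epochs; it is stepped once
    # per epoch at the end of the training loop below.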

    # Define weight parameters for each loss term
    cell_ae_weight = 1.0
    drug_ae_weight = 1.0
    mlp_weight = 1.0

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0.0
        train_correct = 0
        train_total_samples = 0
        for batch_idx, (cell_data, drug_data, target) in enumerate(train_loader):
            mlp_optimizer.zero_grad()

            # Forward pass
            cell_decoded_output, drug_decoded_output, mlp_output = model(cell_data, drug_data)
            # Optional: weight each sample's classification loss by its class weight.
            # Note that BCEWithLogitsLoss expects raw logits; since mlp_output here is
            # already a probability (BCELoss is used above), nn.BCELoss(weight=batch_class_weights)
            # would be the matching weighted variant.
            # batch_class_weights = class_weights[target.long()]
            # mlp_loss_fn = nn.BCEWithLogitsLoss(weight=batch_class_weights)
            # Compute losses
            cell_ae_loss = cell_ae_weight * autoencoder_loss_fn(cell_decoded_output, cell_data)
            drug_ae_loss = drug_ae_weight * autoencoder_loss_fn(drug_decoded_output, drug_data)
            mlp_loss = mlp_weight * mlp_loss_fn(mlp_output, target)

            # Total loss is the sum of autoencoder losses and MLP loss
            total_loss = drug_ae_loss + cell_ae_loss + mlp_loss

            # Backward pass and optimization
            total_loss.backward()
            mlp_optimizer.step()

            total_train_loss += total_loss.item()

            # Calculate accuracy (round the sigmoid probabilities at 0.5)
            train_predictions = torch.round(mlp_output)
            train_correct += (train_predictions == target).sum().item()
            train_total_samples += target.size(0)

        avg_train_loss = total_train_loss / len(train_loader)
        train_loss.append(avg_train_loss)
        # Validation
        model.eval()
        total_val_loss = 0.0
        correct = 0
        total_samples = 0
        with torch.no_grad():
            for val_batch_idx, (cell_data_val, drug_data_val, val_target) in enumerate(val_loader):
                cell_decoded_output_val, drug_decoded_output_val, mlp_output_val = model(cell_data_val, drug_data_val)
                # batch_class_weights = class_weights[val_target.long()]
                # mlp_loss_fn = nn.BCEWithLogitsLoss(weight=batch_class_weights)

                # Compute losses
                cell_ae_loss_val = cell_ae_weight * autoencoder_loss_fn(cell_decoded_output_val, cell_data_val)
                drug_ae_loss_val = drug_ae_weight * autoencoder_loss_fn(drug_decoded_output_val, drug_data_val)
                mlp_loss_val = mlp_weight * mlp_loss_fn(mlp_output_val, val_target)

                # Accumulate (not overwrite) the total validation loss across batches
                total_val_loss += (drug_ae_loss_val + cell_ae_loss_val + mlp_loss_val).item()

                # Calculate accuracy
                val_predictions = torch.round(mlp_output_val)
                correct += (val_predictions == val_target).sum().item()
                total_samples += val_target.size(0)

        avg_val_loss = total_val_loss / len(val_loader)
        val_loss.append(avg_val_loss)

        train_accuracy = train_correct / train_total_samples
        train_accuracies.append(train_accuracy)
        val_accuracy = correct / total_samples
        val_accuracies.append(val_accuracy)
        # Print epoch progress
        print(
            'Epoch [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}, Train Accuracy: {:.4f}, Val Accuracy: {:.4f}'.format(
                epoch + 1, num_epochs, avg_train_loss, avg_val_loss, train_accuracy,
                val_accuracy))

        # Learning rate scheduler step on the epoch's accumulated training loss
        scheduler.step(total_train_loss)

    # Save the trained model
    torch.save(model.state_dict(), MODEL_FOLDER + 'DeepDRA.pth')
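
# The per-epoch histories (train_loss, val_loss, train_accuracies, val_accuracies)
# collected above are not otherwise consumed; a minimal plotting sketch, assuming
# train() is extended to return those four lists (a hypothetical extension):
import matplotlib.pyplot as plt

def plot_history(train_loss, val_loss, train_acc, val_acc):
    """Plot the loss and accuracy curves recorded during training."""
    epochs = range(1, len(train_loss) + 1)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(epochs, train_loss, label='train')
    ax1.plot(epochs, val_loss, label='validation')
    ax1.set_xlabel('epoch')
    ax1.set_ylabel('total loss')
    ax1.legend()
    ax2.plot(epochs, train_acc, label='train')
    ax2.plot(epochs, val_acc, label='validation')
    ax2.set_xlabel('epoch')
    ax2.set_ylabel('accuracy')
    ax2.legend()
    fig.tight_layout()
    plt.show()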
# DeepDRA

## Data

Download the data from this link: https://drive.google.com/drive/folders/1-PgwD7KN9ZxCYBhyGAs3ihlbKK7s9jiO?usp=sharing

## Run

You can run the main code with different datasets and data modalities; both are selected through the constants in the configuration file.
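For example, to train on mutation profiles plus drug descriptors and fingerprints, set (constant names as defined in this repository's configuration):

```python
DATA_MODALITIES = ['cell_mut', 'drug_desc', 'drug_finger']
```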
# Step 2: Calculate and print AUC
fpr, tpr, thresholds = metrics.roc_curve(all_targets, mlp_output)
auc = np.round(metrics.auc(fpr, tpr), 3)

# Step 3: Calculate and print AUPRC
precision, recall, thresholds = metrics.precision_recall_curve(all_targets, mlp_output)
auprc = np.round(metrics.auc(recall, precision), 3)

# Step 4: Print accuracy, AUC, AUPRC, and confusion matrix
accuracy = accuracy_score(all_targets, all_predictions)

avg_auc = np.mean(result_list['AUC'])
avg_auprc = np.mean(result_list['AUPRC'])
std_auprc = np.std(result_list['AUPRC'])
avg_accuracy = np.mean(result_list['Accuracy'])
avg_precision = np.mean(result_list['Precision'])
avg_recall = np.mean(result_list['Recall'])
avg_f1score = np.mean(result_list['F1 score'])
print(
    f'AVG: Accuracy: {avg_accuracy:.3f}, Precision: {avg_precision:.3f}, Recall: {avg_recall:.3f}, '
    f'F1 score: {avg_f1score:.3f}, AUC: {avg_auc:.3f}, AUPRC: {avg_auprc:.3f}')
print("Average AUC: {:.3f} \t Average AUPRC: {:.3f} \t Std AUPRC: {:.3f}".format(avg_auc, avg_auprc, std_auprc))
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler

ae_latent_dim = 50
mlp_input_dim = 2 * ae_latent_dim
mlp_output_dim = 1
num_epochs = 25
model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim, mlp_input_dim, mlp_output_dim)
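# DeepDRA pairs a cell-feature autoencoder and a drug-feature autoencoder (latent
# dimension 50 each); their concatenated latent codes (2 * 50 = 100 features) feed
# a single-output MLP that predicts the drug response probability.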

# Step 3: Convert your training data to PyTorch tensors
y_train_tensor = torch.Tensor(y_train)
y_train_tensor = y_train_tensor.unsqueeze(1)

# Compute class weights
classes = [0, 1]  # Assuming binary classification
class_weights = torch.tensor(compute_class_weight(class_weight='balanced', classes=classes, y=y_train),
                             dtype=torch.float32)
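# With class_weight='balanced', each class is weighted n_samples / (n_classes * count);
# e.g. for hypothetical counts of 8 negatives and 2 positives out of 10 samples, the
# weights are [10 / (2 * 8), 10 / (2 * 2)] = [0.625, 2.5], up-weighting the minority class.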

# Hold out a validation split from the training tensors
x_cell_train_tensor, x_cell_val_tensor, x_drug_train_tensor, x_drug_val_tensor, y_train_tensor, y_val_tensor = train_test_split(
    x_cell_train_tensor, x_drug_train_tensor, y_train_tensor, test_size=0.1,
    random_state=RANDOM_SEED,
    shuffle=True)

# Step 4: Create TensorDatasets with the input features and target labels
train_dataset = TensorDataset(x_cell_train_tensor, x_drug_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_cell_val_tensor, x_drug_val_tensor, y_val_tensor)

# Step 5: Create the train and validation loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# Step 6: Train the model
train(model, train_loader, val_loader, num_epochs, class_weights)

# Step 7: Save the trained model
torch.save(model, 'DeepDRA.pth')

# Step 8: Load the saved model
model = torch.load('DeepDRA.pth')

# Step 9: Convert your test data to PyTorch tensors
x_cell_test_tensor = torch.Tensor(x_cell_test.values)
# Step 2: Load training data
train_data, train_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES,
                                                        raw_file_directory=RAW_BOTH_DATA_FOLDER,
                                                        screen_file_directory=BOTH_SCREENING_DATA_FOLDER,
                                                        sep="\t")

# Step 3: Prepare the model inputs from the loaded data
X_cell_train, X_drug_train, y_train, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(train_data,
                                                                                               train_drug_screen)

# Step 4: Balance the classes once, before the runs, with random under-sampling
rus = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED)
dataset = pd.concat([X_cell_train, X_drug_train], axis=1)
dataset.index = X_cell_train.index
dataset, y_train = rus.fit_resample(dataset, y_train)
X_cell_train = dataset.iloc[:, :sum(cell_sizes)]
X_drug_train = dataset.iloc[:, sum(cell_sizes):]
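# Note: sampling_strategy="majority" keeps every minority-class pair and randomly
# drops majority-class pairs until both classes have equal counts, so training uses
# a balanced set; the cell and drug blocks are then re-split by column counts
# (the first sum(cell_sizes) columns are cell features).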

# Step 5: Loop over k runs
for i in range(k):
    print('Run {}'.format(i))

    # Step 6: If is_test is True, evaluate on the held-out test set
    if is_test:
        # Step 7: Train and evaluate the DeepDRA model on test data
        results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes,
    else:
        # Otherwise, split off a fresh test portion from the training data
        X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test = train_test_split(X_cell_train,
                                                                                                 X_drug_train, y_train,
                                                                                                 test_size=0.2,
                                                                                                 random_state=RANDOM_SEED,
                                                                                                 shuffle=True)
        # Step 9: Train and evaluate the DeepDRA model on the split data
        results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes,
DRUG_DATA_FOLDER = os.path.join(DATA_FOLDER, 'drug_data')
GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data')
CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_data')
CTRP_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_data')

GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv')
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv')
CTRP_SCREENING_DATA_FOLDER = os.path.join(CTRP_RAW_DATA_FOLDER, 'drug_screening_matrix_ctrp.tsv')
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv')

CTRP_FOLDER = os.path.join(DATA_FOLDER, 'CTRP')

SAVE_MODEL = False  # Change to True to save the trained model
VARIATIONAL_AUTOENCODERS = False

# DATA_MODALITIES=['cell_CN','cell_exp','cell_methy','cell_mut','drug_comp','drug_DT']  # Change this list to only consider specific data modalities
DATA_MODALITIES = ['cell_CN', 'cell_exp', 'cell_mut', 'drug_desc']

RANDOM_SEED = 42  # Use this seed wherever randomness occurs, for reproducibility