return torch.square(w).sum() | return torch.square(w).sum() | ||||
def train(model, train_loader, num_epochs): | |||||
def train(model, train_loader, val_loader, num_epochs,class_weights): | |||||
""" | """ | ||||
Trains the DeepDRA (Deep Drug Response Anticipation) model. | Trains the DeepDRA (Deep Drug Response Anticipation) model. | ||||
- train_loader (DataLoader): DataLoader for the training dataset. | - train_loader (DataLoader): DataLoader for the training dataset. | ||||
- num_epochs (int): Number of training epochs. | - num_epochs (int): Number of training epochs. | ||||
""" | """ | ||||
autoencoder_loss_fn = nn.MSELoss() | autoencoder_loss_fn = nn.MSELoss() | ||||
mlp_loss_fn = nn.BCELoss() | mlp_loss_fn = nn.BCELoss() | ||||
mlp_optimizer = optim.Adam(model.parameters(), lr=0.0005) | |||||
train_accuracies = [] | |||||
val_accuracies = [] | |||||
train_loss = [] | |||||
val_loss = [] | |||||
mlp_optimizer = optim.Adam(model.parameters(), lr=0.0005,) | |||||
scheduler = lr_scheduler.ReduceLROnPlateau(mlp_optimizer, mode='min', factor=0.8, patience=5, verbose=True) | scheduler = lr_scheduler.ReduceLROnPlateau(mlp_optimizer, mode='min', factor=0.8, patience=5, verbose=True) | ||||
# Define weight parameters for each loss term | |||||
cell_ae_weight = 1.0 | |||||
drug_ae_weight = 1.0 | |||||
mlp_weight = 1.0 | |||||
for epoch in range(num_epochs): | for epoch in range(num_epochs): | ||||
model.train() | |||||
total_train_loss = 0.0 | |||||
train_correct = 0 | |||||
train_total_samples = 0 | |||||
for batch_idx, (cell_data, drug_data, target) in enumerate(train_loader): | for batch_idx, (cell_data, drug_data, target) in enumerate(train_loader): | ||||
mlp_optimizer.zero_grad() | mlp_optimizer.zero_grad() | ||||
# Forward pass | # Forward pass | ||||
cell_decoded_output, drug_decoded_output, mlp_output = model(cell_data, drug_data) | cell_decoded_output, drug_decoded_output, mlp_output = model(cell_data, drug_data) | ||||
# Compute class weights for the current batch | |||||
# batch_class_weights = class_weights[target.long()] | |||||
# mlp_loss_fn = nn.BCEWithLogitsLoss(weight=batch_class_weights) | |||||
# Compute losses | # Compute losses | ||||
cell_ae_loss = autoencoder_loss_fn(cell_decoded_output, cell_data) | |||||
drug_ae_loss = autoencoder_loss_fn(drug_decoded_output, drug_data) | |||||
mlp_loss = mlp_loss_fn(mlp_output, target) | |||||
cell_ae_loss = cell_ae_weight * autoencoder_loss_fn(cell_decoded_output, cell_data) | |||||
drug_ae_loss = drug_ae_weight * autoencoder_loss_fn(drug_decoded_output, drug_data) | |||||
mlp_loss = mlp_weight * mlp_loss_fn(mlp_output, target) | |||||
# Total loss is the sum of autoencoder losses and MLP loss | # Total loss is the sum of autoencoder losses and MLP loss | ||||
total_loss = drug_ae_loss + cell_ae_loss + mlp_loss | total_loss = drug_ae_loss + cell_ae_loss + mlp_loss | ||||
# Backward pass and optimization | # Backward pass and optimization | ||||
total_loss.backward() | total_loss.backward() | ||||
mlp_optimizer.step() | mlp_optimizer.step() | ||||
total_train_loss += total_loss.item() | |||||
# Calculate accuracy | |||||
train_predictions = torch.round(mlp_output) | |||||
train_correct += (train_predictions == target).sum().item() | |||||
train_total_samples += target.size(0) | |||||
avg_train_loss = total_train_loss / len(train_loader) | |||||
train_loss.append(avg_train_loss) | |||||
# Validation | |||||
model.eval() | |||||
total_val_loss = 0.0 | |||||
correct = 0 | |||||
total_samples = 0 | |||||
with torch.no_grad(): | |||||
for val_batch_idx, (cell_data_val, drug_data_val, val_target) in enumerate(val_loader): | |||||
cell_decoded_output_val, drug_decoded_output_val, mlp_output_val = model(cell_data_val, drug_data_val) | |||||
# batch_class_weights = class_weights[val_target.long()] | |||||
# mlp_loss_fn = nn.BCEWithLogitsLoss(weight=batch_class_weights) | |||||
# Compute losses | |||||
cell_ae_loss_val = cell_ae_weight * autoencoder_loss_fn(cell_decoded_output_val, cell_data_val) | |||||
drug_ae_loss_val = drug_ae_weight * autoencoder_loss_fn(drug_decoded_output_val, drug_data_val) | |||||
mlp_loss_val = mlp_weight * mlp_loss_fn(mlp_output_val, val_target) | |||||
# Total loss is the sum of autoencoder losses and MLP loss | |||||
total_val_loss = drug_ae_loss_val + cell_ae_loss_val + mlp_loss_val | |||||
# Calculate accuracy | |||||
val_predictions = torch.round(mlp_output_val) | |||||
correct += (val_predictions == val_target).sum().item() | |||||
total_samples += val_target.size(0) | |||||
avg_val_loss = total_val_loss / len(val_loader) | |||||
val_loss.append(avg_val_loss) | |||||
train_accuracy = train_correct / train_total_samples | |||||
train_accuracies.append(train_accuracy) | |||||
val_accuracy = correct / total_samples | |||||
val_accuracies.append(val_accuracy) | |||||
# Print progress | |||||
if batch_idx % 200 == 0: | |||||
print('Epoch [{}/{}], Total Loss: {:.4f}'.format( | |||||
epoch + 1, num_epochs, total_loss.item())) | |||||
print( | |||||
'Epoch [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}, Train Accuracy: {:.4f}, Val Accuracy: {:.4f}'.format( | |||||
epoch + 1, num_epochs, avg_train_loss, avg_val_loss, train_accuracy, | |||||
val_accuracy)) | |||||
# Learning rate scheduler step | # Learning rate scheduler step | ||||
scheduler.step(total_loss) | |||||
scheduler.step(total_train_loss) | |||||
# Save the trained model | # Save the trained model | ||||
torch.save(model.state_dict(), MODEL_FOLDER + 'DeepDRA.pth') | torch.save(model.state_dict(), MODEL_FOLDER + 'DeepDRA.pth') |
# DeepDRA | # DeepDRA | ||||
Data | |||||
Download data from this link: https://drive.google.com/drive/folders/1-PgwD7KN9ZxCYBhyGAs3ihlbKK7s9jiO?usp=sharing | |||||
Run | |||||
You can run the main code with different data sets |
# Step 2: Calculate and print AUC | # Step 2: Calculate and print AUC | ||||
fpr, tpr, thresholds = metrics.roc_curve(all_targets, mlp_output) | fpr, tpr, thresholds = metrics.roc_curve(all_targets, mlp_output) | ||||
auc = np.round(metrics.auc(fpr, tpr), 2) | |||||
auc = np.round(metrics.auc(fpr, tpr), 3) | |||||
# Step 3: Calculate and print AUPRC | # Step 3: Calculate and print AUPRC | ||||
precision, recall, thresholds = metrics.precision_recall_curve(all_targets, mlp_output) | precision, recall, thresholds = metrics.precision_recall_curve(all_targets, mlp_output) | ||||
auprc = np.round(metrics.auc(recall, precision), 2) | |||||
auprc = np.round(metrics.auc(recall, precision), 3) | |||||
# Step 4: Print accuracy, AUC, AUPRC, and confusion matrix | # Step 4: Print accuracy, AUC, AUPRC, and confusion matrix | ||||
accuracy = accuracy_score(all_targets, all_predictions) | accuracy = accuracy_score(all_targets, all_predictions) | ||||
avg_auc = np.mean(result_list['AUC']) | avg_auc = np.mean(result_list['AUC']) | ||||
avg_auprc = np.mean(result_list['AUPRC']) | avg_auprc = np.mean(result_list['AUPRC']) | ||||
std_auprc = np.std(result_list['AUPRC']) | std_auprc = np.std(result_list['AUPRC']) | ||||
avg_accuracy = np.mean(result_list['Accuracy']) | |||||
avg_precision = np.mean(result_list['Precision']) | |||||
avg_recal = np.mean(result_list['Recall']) | |||||
avg_f1score = np.mean(result_list['F1 score']) | |||||
print( | |||||
f'AVG: Accuracy: {avg_accuracy:.3f}, Precision: {avg_precision:.3f}, Recall: {avg_recal:.3f}, F1 score: {avg_f1score:.3f}, AUC: {avg_auc:.3f}, ,AUPRC: {avg_auprc:.3f}') | |||||
print(" Average AUC: {:.3f} \t Average AUPRC: {:.3f} \t Std AUPRC: {:.3f}".format(avg_auc, avg_auprc, std_auprc)) | print(" Average AUC: {:.3f} \t Average AUPRC: {:.3f} \t Std AUPRC: {:.3f}".format(avg_auc, avg_auprc, std_auprc)) |
from imblearn.under_sampling import RandomUnderSampler | from imblearn.under_sampling import RandomUnderSampler | ||||
from sklearn.model_selection import train_test_split | from sklearn.model_selection import train_test_split | ||||
from sklearn.utils.class_weight import compute_class_weight | |||||
from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler | from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler | ||||
ae_latent_dim = 50 | ae_latent_dim = 50 | ||||
mlp_input_dim = 2 * ae_latent_dim | mlp_input_dim = 2 * ae_latent_dim | ||||
mlp_output_dim = 1 | mlp_output_dim = 1 | ||||
num_epochs = 20 | |||||
num_epochs = 25 | |||||
model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim, mlp_input_dim, mlp_output_dim) | model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim, mlp_input_dim, mlp_output_dim) | ||||
# Step 3: Convert your training data to PyTorch tensors | # Step 3: Convert your training data to PyTorch tensors | ||||
y_train_tensor = torch.Tensor(y_train) | y_train_tensor = torch.Tensor(y_train) | ||||
y_train_tensor = y_train_tensor.unsqueeze(1) | y_train_tensor = y_train_tensor.unsqueeze(1) | ||||
# Compute class weights | |||||
classes = [0, 1] # Assuming binary classification | |||||
class_weights = torch.tensor(compute_class_weight(class_weight='balanced', classes=classes, y=y_train), | |||||
dtype=torch.float32) | |||||
x_cell_train_tensor, x_cell_val_tensor, x_drug_train_tensor, x_drug_val_tensor, y_train_tensor, y_val_tensor = train_test_split( | |||||
x_cell_train_tensor, x_drug_train_tensor, y_train_tensor, test_size=0.1, | |||||
random_state=RANDOM_SEED, | |||||
shuffle=True) | |||||
# Step 4: Create a TensorDataset with the input features and target labels | # Step 4: Create a TensorDataset with the input features and target labels | ||||
train_dataset = TensorDataset(x_cell_train_tensor, x_drug_train_tensor, y_train_tensor) | train_dataset = TensorDataset(x_cell_train_tensor, x_drug_train_tensor, y_train_tensor) | ||||
val_dataset = TensorDataset(x_cell_val_tensor, x_drug_val_tensor, y_val_tensor) | |||||
# Step 5: Create the train_loader | # Step 5: Create the train_loader | ||||
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) | train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) | ||||
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True) | |||||
# Step 6: Train the model | # Step 6: Train the model | ||||
train(model, train_loader, num_epochs=num_epochs) | |||||
train(model, train_loader, val_loader, num_epochs,class_weights) | |||||
# Step 7: Save the trained model | # Step 7: Save the trained model | ||||
torch.save(model, MODEL_FOLDER + 'DeepDRA.pth') | |||||
torch.save(model, 'DeepDRA.pth') | |||||
# Step 8: Load the saved model | # Step 8: Load the saved model | ||||
model = torch.load( MODEL_FOLDER + 'DeepDRA.pth') | |||||
model = torch.load('DeepDRA.pth') | |||||
# Step 9: Convert your test data to PyTorch tensors | # Step 9: Convert your test data to PyTorch tensors | ||||
x_cell_test_tensor = torch.Tensor(x_cell_test.values) | x_cell_test_tensor = torch.Tensor(x_cell_test.values) | ||||
# Step 2: Load training data | # Step 2: Load training data | ||||
train_data, train_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES, | train_data, train_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES, | ||||
raw_file_directory=GDSC_RAW_DATA_FOLDER, | |||||
screen_file_directory=GDSC_SCREENING_DATA_FOLDER, | |||||
raw_file_directory=RAW_BOTH_DATA_FOLDER, | |||||
screen_file_directory=BOTH_SCREENING_DATA_FOLDER, | |||||
sep="\t") | sep="\t") | ||||
# Step 3: Load test data if applicable | # Step 3: Load test data if applicable | ||||
X_cell_train, X_drug_train, y_train, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(train_data, | X_cell_train, X_drug_train, y_train, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(train_data, | ||||
train_drug_screen) | train_drug_screen) | ||||
rus = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED) | |||||
dataset = pd.concat([X_cell_train, X_drug_train], axis=1) | |||||
dataset.index = X_cell_train.index | |||||
dataset, y_train = rus.fit_resample(dataset, y_train) | |||||
X_cell_train = dataset.iloc[:, :sum(cell_sizes)] | |||||
X_drug_train = dataset.iloc[:, sum(cell_sizes):] | |||||
# Step 5: Loop over k runs | # Step 5: Loop over k runs | ||||
for i in range(k): | for i in range(k): | ||||
print('Run {}'.format(i)) | print('Run {}'.format(i)) | ||||
# Step 6: If is_test is True, perform random under-sampling on the training data | # Step 6: If is_test is True, perform random under-sampling on the training data | ||||
if is_test: | if is_test: | ||||
rus = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED) | |||||
dataset = pd.concat([X_cell_train, X_drug_train], axis=1) | |||||
dataset.index = X_cell_train.index | |||||
dataset, y_train = rus.fit_resample(dataset, y_train) | |||||
X_cell_train = dataset.iloc[:, :sum(cell_sizes)] | |||||
X_drug_train = dataset.iloc[:, sum(cell_sizes):] | |||||
# Step 7: Train and evaluate the DeepDRA model on test data | # Step 7: Train and evaluate the DeepDRA model on test data | ||||
results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes, | results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes, | ||||
X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test = train_test_split(X_cell_train, | X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test = train_test_split(X_cell_train, | ||||
X_drug_train, y_train, | X_drug_train, y_train, | ||||
test_size=0.2, | test_size=0.2, | ||||
random_state=44, | |||||
random_state=RANDOM_SEED, | |||||
shuffle=True) | shuffle=True) | ||||
# Step 9: Train and evaluate the DeepDRA model on the split data | # Step 9: Train and evaluate the DeepDRA model on the split data | ||||
results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes, | results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes, |
DRUG_DATA_FOLDER = os.path.join(DATA_FOLDER, 'drug_data') | DRUG_DATA_FOLDER = os.path.join(DATA_FOLDER, 'drug_data') | ||||
GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data') | GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data') | ||||
CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_data') | CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_data') | ||||
CTRP_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_data') | |||||
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv') | GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv') | ||||
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv') | CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv') | ||||
CTRP_SCREENING_DATA_FOLDER = os.path.join(CTRP_RAW_DATA_FOLDER, 'drug_screening_matrix_ctrp.tsv') | |||||
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv') | BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv') | ||||
CTRP_FOLDER = os.path.join(DATA_FOLDER, 'CTRP') | CTRP_FOLDER = os.path.join(DATA_FOLDER, 'CTRP') | ||||
SAVE_MODEL = False # Change it to True to save the trained model | SAVE_MODEL = False # Change it to True to save the trained model | ||||
VARIATIONAL_AUTOENCODERS = False | VARIATIONAL_AUTOENCODERS = False | ||||
# DATA_MODALITIES=['cell_CN','cell_exp','cell_methy','cell_mut','drug_comp','drug_DT'] # Change this list to only consider specific data modalities | # DATA_MODALITIES=['cell_CN','cell_exp','cell_methy','cell_mut','drug_comp','drug_DT'] # Change this list to only consider specific data modalities | ||||
DATA_MODALITIES = ['cell_mut', 'drug_desc', 'drug_finger'] | |||||
DATA_MODALITIES = ['cell_CN','cell_exp','cell_mut', 'drug_desc'] | |||||
RANDOM_SEED = 42 # Must be used wherever can be used | RANDOM_SEED = 42 # Must be used wherever can be used | ||||