@@ -88,7 +88,7 @@ class DeepDRA(nn.Module): | |||
return torch.square(w).sum() | |||
def train(model, train_loader, num_epochs): | |||
def train(model, train_loader, val_loader, num_epochs,class_weights): | |||
""" | |||
Trains the DeepDRA (Deep Drug Response Anticipation) model. | |||
@@ -97,23 +97,44 @@ def train(model, train_loader, num_epochs): | |||
- train_loader (DataLoader): DataLoader for the training dataset. | |||
- num_epochs (int): Number of training epochs. | |||
""" | |||
autoencoder_loss_fn = nn.MSELoss() | |||
mlp_loss_fn = nn.BCELoss() | |||
mlp_optimizer = optim.Adam(model.parameters(), lr=0.0005) | |||
train_accuracies = [] | |||
val_accuracies = [] | |||
train_loss = [] | |||
val_loss = [] | |||
mlp_optimizer = optim.Adam(model.parameters(), lr=0.0005,) | |||
scheduler = lr_scheduler.ReduceLROnPlateau(mlp_optimizer, mode='min', factor=0.8, patience=5, verbose=True) | |||
# Define weight parameters for each loss term | |||
cell_ae_weight = 1.0 | |||
drug_ae_weight = 1.0 | |||
mlp_weight = 1.0 | |||
for epoch in range(num_epochs): | |||
model.train() | |||
total_train_loss = 0.0 | |||
train_correct = 0 | |||
train_total_samples = 0 | |||
for batch_idx, (cell_data, drug_data, target) in enumerate(train_loader): | |||
mlp_optimizer.zero_grad() | |||
# Forward pass | |||
cell_decoded_output, drug_decoded_output, mlp_output = model(cell_data, drug_data) | |||
# Compute class weights for the current batch | |||
# batch_class_weights = class_weights[target.long()] | |||
# mlp_loss_fn = nn.BCEWithLogitsLoss(weight=batch_class_weights) | |||
# Compute losses | |||
cell_ae_loss = autoencoder_loss_fn(cell_decoded_output, cell_data) | |||
drug_ae_loss = autoencoder_loss_fn(drug_decoded_output, drug_data) | |||
mlp_loss = mlp_loss_fn(mlp_output, target) | |||
cell_ae_loss = cell_ae_weight * autoencoder_loss_fn(cell_decoded_output, cell_data) | |||
drug_ae_loss = drug_ae_weight * autoencoder_loss_fn(drug_decoded_output, drug_data) | |||
mlp_loss = mlp_weight * mlp_loss_fn(mlp_output, target) | |||
# Total loss is the sum of autoencoder losses and MLP loss | |||
total_loss = drug_ae_loss + cell_ae_loss + mlp_loss | |||
@@ -121,14 +142,56 @@ def train(model, train_loader, num_epochs): | |||
# Backward pass and optimization | |||
total_loss.backward() | |||
mlp_optimizer.step() | |||
total_train_loss += total_loss.item() | |||
# Calculate accuracy | |||
train_predictions = torch.round(mlp_output) | |||
train_correct += (train_predictions == target).sum().item() | |||
train_total_samples += target.size(0) | |||
avg_train_loss = total_train_loss / len(train_loader) | |||
train_loss.append(avg_train_loss) | |||
# Validation | |||
model.eval() | |||
total_val_loss = 0.0 | |||
correct = 0 | |||
total_samples = 0 | |||
with torch.no_grad(): | |||
for val_batch_idx, (cell_data_val, drug_data_val, val_target) in enumerate(val_loader): | |||
cell_decoded_output_val, drug_decoded_output_val, mlp_output_val = model(cell_data_val, drug_data_val) | |||
# batch_class_weights = class_weights[val_target.long()] | |||
# mlp_loss_fn = nn.BCEWithLogitsLoss(weight=batch_class_weights) | |||
# Compute losses | |||
cell_ae_loss_val = cell_ae_weight * autoencoder_loss_fn(cell_decoded_output_val, cell_data_val) | |||
drug_ae_loss_val = drug_ae_weight * autoencoder_loss_fn(drug_decoded_output_val, drug_data_val) | |||
mlp_loss_val = mlp_weight * mlp_loss_fn(mlp_output_val, val_target) | |||
# Total loss is the sum of autoencoder losses and MLP loss | |||
total_val_loss = drug_ae_loss_val + cell_ae_loss_val + mlp_loss_val | |||
# Calculate accuracy | |||
val_predictions = torch.round(mlp_output_val) | |||
correct += (val_predictions == val_target).sum().item() | |||
total_samples += val_target.size(0) | |||
avg_val_loss = total_val_loss / len(val_loader) | |||
val_loss.append(avg_val_loss) | |||
train_accuracy = train_correct / train_total_samples | |||
train_accuracies.append(train_accuracy) | |||
val_accuracy = correct / total_samples | |||
val_accuracies.append(val_accuracy) | |||
# Print progress | |||
if batch_idx % 200 == 0: | |||
print('Epoch [{}/{}], Total Loss: {:.4f}'.format( | |||
epoch + 1, num_epochs, total_loss.item())) | |||
print( | |||
'Epoch [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}, Train Accuracy: {:.4f}, Val Accuracy: {:.4f}'.format( | |||
epoch + 1, num_epochs, avg_train_loss, avg_val_loss, train_accuracy, | |||
val_accuracy)) | |||
# Learning rate scheduler step | |||
scheduler.step(total_loss) | |||
scheduler.step(total_train_loss) | |||
# Save the trained model | |||
torch.save(model.state_dict(), MODEL_FOLDER + 'DeepDRA.pth') |
@@ -1 +1,8 @@ | |||
# DeepDRA | |||
Data | |||
Download the data from this link: https://drive.google.com/drive/folders/1-PgwD7KN9ZxCYBhyGAs3ihlbKK7s9jiO?usp=sharing | |||
Run | |||
You can run the main code with different datasets. | |||
@@ -72,11 +72,11 @@ class Evaluation: | |||
# Step 2: Calculate and print AUC | |||
fpr, tpr, thresholds = metrics.roc_curve(all_targets, mlp_output) | |||
auc = np.round(metrics.auc(fpr, tpr), 2) | |||
auc = np.round(metrics.auc(fpr, tpr), 3) | |||
# Step 3: Calculate and print AUPRC | |||
precision, recall, thresholds = metrics.precision_recall_curve(all_targets, mlp_output) | |||
auprc = np.round(metrics.auc(recall, precision), 2) | |||
auprc = np.round(metrics.auc(recall, precision), 3) | |||
# Step 4: Print accuracy, AUC, AUPRC, and confusion matrix | |||
accuracy = accuracy_score(all_targets, all_predictions) | |||
@@ -162,4 +162,11 @@ class Evaluation: | |||
avg_auc = np.mean(result_list['AUC']) | |||
avg_auprc = np.mean(result_list['AUPRC']) | |||
std_auprc = np.std(result_list['AUPRC']) | |||
avg_accuracy = np.mean(result_list['Accuracy']) | |||
avg_precision = np.mean(result_list['Precision']) | |||
avg_recal = np.mean(result_list['Recall']) | |||
avg_f1score = np.mean(result_list['F1 score']) | |||
print( | |||
f'AVG: Accuracy: {avg_accuracy:.3f}, Precision: {avg_precision:.3f}, Recall: {avg_recal:.3f}, F1 score: {avg_f1score:.3f}, AUC: {avg_auc:.3f}, ,AUPRC: {avg_auprc:.3f}') | |||
print(" Average AUC: {:.3f} \t Average AUPRC: {:.3f} \t Std AUPRC: {:.3f}".format(avg_auc, avg_auprc, std_auprc)) |
@@ -1,5 +1,6 @@ | |||
from imblearn.under_sampling import RandomUnderSampler | |||
from sklearn.model_selection import train_test_split | |||
from sklearn.utils.class_weight import compute_class_weight | |||
from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler | |||
@@ -39,7 +40,7 @@ def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, | |||
ae_latent_dim = 50 | |||
mlp_input_dim = 2 * ae_latent_dim | |||
mlp_output_dim = 1 | |||
num_epochs = 20 | |||
num_epochs = 25 | |||
model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim, mlp_input_dim, mlp_output_dim) | |||
# Step 3: Convert your training data to PyTorch tensors | |||
@@ -50,20 +51,34 @@ def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, | |||
y_train_tensor = torch.Tensor(y_train) | |||
y_train_tensor = y_train_tensor.unsqueeze(1) | |||
# Compute class weights | |||
classes = [0, 1] # Assuming binary classification | |||
class_weights = torch.tensor(compute_class_weight(class_weight='balanced', classes=classes, y=y_train), | |||
dtype=torch.float32) | |||
x_cell_train_tensor, x_cell_val_tensor, x_drug_train_tensor, x_drug_val_tensor, y_train_tensor, y_val_tensor = train_test_split( | |||
x_cell_train_tensor, x_drug_train_tensor, y_train_tensor, test_size=0.1, | |||
random_state=RANDOM_SEED, | |||
shuffle=True) | |||
# Step 4: Create a TensorDataset with the input features and target labels | |||
train_dataset = TensorDataset(x_cell_train_tensor, x_drug_train_tensor, y_train_tensor) | |||
val_dataset = TensorDataset(x_cell_val_tensor, x_drug_val_tensor, y_val_tensor) | |||
# Step 5: Create the train_loader | |||
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) | |||
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True) | |||
# Step 6: Train the model | |||
train(model, train_loader, num_epochs=num_epochs) | |||
train(model, train_loader, val_loader, num_epochs,class_weights) | |||
# Step 7: Save the trained model | |||
torch.save(model, MODEL_FOLDER + 'DeepDRA.pth') | |||
torch.save(model, 'DeepDRA.pth') | |||
# Step 8: Load the saved model | |||
model = torch.load( MODEL_FOLDER + 'DeepDRA.pth') | |||
model = torch.load('DeepDRA.pth') | |||
# Step 9: Convert your test data to PyTorch tensors | |||
x_cell_test_tensor = torch.Tensor(x_cell_test.values) | |||
@@ -99,8 +114,8 @@ def run(k, is_test=False): | |||
# Step 2: Load training data | |||
train_data, train_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES, | |||
raw_file_directory=GDSC_RAW_DATA_FOLDER, | |||
screen_file_directory=GDSC_SCREENING_DATA_FOLDER, | |||
raw_file_directory=RAW_BOTH_DATA_FOLDER, | |||
screen_file_directory=BOTH_SCREENING_DATA_FOLDER, | |||
sep="\t") | |||
# Step 3: Load test data if applicable | |||
@@ -117,18 +132,19 @@ def run(k, is_test=False): | |||
X_cell_train, X_drug_train, y_train, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(train_data, | |||
train_drug_screen) | |||
rus = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED) | |||
dataset = pd.concat([X_cell_train, X_drug_train], axis=1) | |||
dataset.index = X_cell_train.index | |||
dataset, y_train = rus.fit_resample(dataset, y_train) | |||
X_cell_train = dataset.iloc[:, :sum(cell_sizes)] | |||
X_drug_train = dataset.iloc[:, sum(cell_sizes):] | |||
# Step 5: Loop over k runs | |||
for i in range(k): | |||
print('Run {}'.format(i)) | |||
# Step 6: If is_test is True, perform random under-sampling on the training data | |||
if is_test: | |||
rus = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED) | |||
dataset = pd.concat([X_cell_train, X_drug_train], axis=1) | |||
dataset.index = X_cell_train.index | |||
dataset, y_train = rus.fit_resample(dataset, y_train) | |||
X_cell_train = dataset.iloc[:, :sum(cell_sizes)] | |||
X_drug_train = dataset.iloc[:, sum(cell_sizes):] | |||
# Step 7: Train and evaluate the DeepDRA model on test data | |||
results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes, | |||
@@ -138,7 +154,7 @@ def run(k, is_test=False): | |||
X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test = train_test_split(X_cell_train, | |||
X_drug_train, y_train, | |||
test_size=0.2, | |||
random_state=44, | |||
random_state=RANDOM_SEED, | |||
shuffle=True) | |||
# Step 9: Train and evaluate the DeepDRA model on the split data | |||
results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes, |
@@ -7,8 +7,11 @@ RAW_BOTH_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_GDSC_data') | |||
DRUG_DATA_FOLDER = os.path.join(DATA_FOLDER, 'drug_data') | |||
GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data') | |||
CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_data') | |||
CTRP_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_data') | |||
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv') | |||
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv') | |||
CTRP_SCREENING_DATA_FOLDER = os.path.join(CTRP_RAW_DATA_FOLDER, 'drug_screening_matrix_ctrp.tsv') | |||
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv') | |||
CTRP_FOLDER = os.path.join(DATA_FOLDER, 'CTRP') | |||
@@ -27,7 +30,7 @@ SIM_KERNEL = {'cell_CN': ('euclidean', 0.001), 'cell_exp': ('euclidean', 0.01), | |||
SAVE_MODEL = False # Change it to True to save the trained model | |||
VARIATIONAL_AUTOENCODERS = False | |||
# DATA_MODALITIES=['cell_CN','cell_exp','cell_methy','cell_mut','drug_comp','drug_DT'] # Change this list to only consider specific data modalities | |||
DATA_MODALITIES = ['cell_mut', 'drug_desc', 'drug_finger'] | |||
DATA_MODALITIES = ['cell_CN','cell_exp','cell_mut', 'drug_desc'] | |||
RANDOM_SEED = 42 # Use this seed wherever a random seed / random_state is accepted | |||