@@ -0,0 +1,173 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

from autoencoder import Autoencoder
from evaluation import Evaluation
from mlp import MLP
from utils import *


class DeepDRA(nn.Module):
    """
    DeepDRA (Deep Drug Response Anticipation) is a neural network model composed of two autoencoders, one for the
    cell modality and one for the drug modality, and an MLP that integrates the encoded features and makes predictions.

    Parameters:
    - cell_modality_sizes (list): Sizes of the cell modality features.
    - drug_modality_sizes (list): Sizes of the drug modality features.
    - cell_ae_latent_dim (int): Latent dimension for the cell autoencoder.
    - drug_ae_latent_dim (int): Latent dimension for the drug autoencoder.
    - mlp_input_dim (int): Input dimension for the MLP.
    - mlp_output_dim (int): Output dimension for the MLP.
    """

    def __init__(self, cell_modality_sizes, drug_modality_sizes, cell_ae_latent_dim, drug_ae_latent_dim, mlp_input_dim,
                 mlp_output_dim):
        super(DeepDRA, self).__init__()

        # Initialize cell and drug autoencoders
        self.cell_autoencoder = Autoencoder(sum(cell_modality_sizes), cell_ae_latent_dim)
        self.drug_autoencoder = Autoencoder(sum(drug_modality_sizes), drug_ae_latent_dim)

        # Store modality sizes
        self.cell_modality_sizes = cell_modality_sizes
        self.drug_modality_sizes = drug_modality_sizes

        # Initialize MLP
        self.mlp = MLP(mlp_input_dim, mlp_output_dim)
    def forward(self, cell_x, drug_x):
        """
        Forward pass of the DeepDRA model.

        Parameters:
        - cell_x (torch.Tensor): Input tensor for the cell modality.
        - drug_x (torch.Tensor): Input tensor for the drug modality.

        Returns:
        - cell_decoded (torch.Tensor): Decoded tensor for the cell modality.
        - drug_decoded (torch.Tensor): Decoded tensor for the drug modality.
        - mlp_output (torch.Tensor): Output tensor from the MLP.
        """
        # Encode and decode cell modality
        cell_encoded = self.cell_autoencoder.encoder(cell_x)
        cell_decoded = self.cell_autoencoder.decoder(cell_encoded)

        # Encode and decode drug modality
        drug_encoded = self.drug_autoencoder.encoder(drug_x)
        drug_decoded = self.drug_autoencoder.decoder(drug_encoded)

        # Concatenate encoded cell and drug features and pass through the MLP
        mlp_output = self.mlp(torch.cat((cell_encoded, drug_encoded), 1))

        return cell_decoded, drug_decoded, mlp_output
    def compute_l1_loss(self, w):
        """
        Computes the L1 regularization loss.

        Parameters:
        - w (torch.Tensor): Weight tensor to regularize.

        Returns:
        - loss (torch.Tensor): L1 regularization loss.
        """
        return torch.abs(w).sum()

    def compute_l2_loss(self, w):
        """
        Computes the L2 regularization loss.

        Parameters:
        - w (torch.Tensor): Weight tensor to regularize.

        Returns:
        - loss (torch.Tensor): L2 regularization loss.
        """
        return torch.square(w).sum()
def train(model, train_loader, num_epochs):
    """
    Trains the DeepDRA (Deep Drug Response Anticipation) model.

    Parameters:
    - model (DeepDRA): The DeepDRA model to be trained.
    - train_loader (DataLoader): DataLoader for the training dataset.
    - num_epochs (int): Number of training epochs.
    """
    autoencoder_loss_fn = nn.MSELoss()
    mlp_loss_fn = nn.BCELoss()

    mlp_optimizer = optim.Adam(model.parameters(), lr=0.0005)
    scheduler = lr_scheduler.ReduceLROnPlateau(mlp_optimizer, mode='min', factor=0.8, patience=5, verbose=True)

    model.train()
    for epoch in range(num_epochs):
        for batch_idx, (cell_data, drug_data, target) in enumerate(train_loader):
            mlp_optimizer.zero_grad()

            # Forward pass
            cell_decoded_output, drug_decoded_output, mlp_output = model(cell_data, drug_data)

            # Compute losses: each autoencoder reconstructs its own input, and the MLP predicts the response label
            cell_ae_loss = autoencoder_loss_fn(cell_decoded_output, cell_data)
            drug_ae_loss = autoencoder_loss_fn(drug_decoded_output, drug_data)
            mlp_loss = mlp_loss_fn(mlp_output, target)

            # Total loss is the sum of autoencoder losses and MLP loss
            total_loss = drug_ae_loss + cell_ae_loss + mlp_loss

            # Backward pass and optimization
            total_loss.backward()
            mlp_optimizer.step()

            # Print progress
            if batch_idx % 200 == 0:
                print('Epoch [{}/{}], Total Loss: {:.4f}'.format(
                    epoch + 1, num_epochs, total_loss.item()))

        # Learning rate scheduler step (uses the last batch's loss of the epoch)
        scheduler.step(total_loss)

    # Save the trained model
    os.makedirs(MODEL_FOLDER, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(MODEL_FOLDER, 'DeepDRA.pth'))
def test(model, test_loader, reverse=False):
    """
    Tests the given model on the test dataset using evaluation metrics.

    Parameters:
    - model: The trained model to be evaluated.
    - test_loader: DataLoader for the test dataset.
    - reverse (bool): If True, reverse the predictions for evaluation.

    Returns:
    - result: The evaluation result based on the chosen metrics.
    """
    # Set model to evaluation mode
    model.eval()

    # The test loader is expected to yield a single full-dataset batch
    for i, (test_cell_loader, test_drug_loader, labels) in enumerate(test_loader):
        # Forward pass through the model
        with torch.no_grad():
            decoded_cell_output, decoded_drug_output, mlp_output = model(test_cell_loader, test_drug_loader)

        # Apply reverse if specified
        predictions = 1 - mlp_output if reverse else mlp_output

    # Evaluate the predictions using the specified metrics
    result = Evaluation.evaluate(labels, predictions)
    return result
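
# --- Minimal usage sketch (illustrative only, not part of the original file) ---
# Runs a forward pass on random tensors to show the expected shapes. The two
# modalities of sizes 100/50 (cells) and 60/40 (drugs) and the latent dimension
# of 50 are arbitrary demo values; 50 mirrors the default used in main.py.
if __name__ == '__main__':
    cell_sizes, drug_sizes, latent_dim = [100, 50], [60, 40], 50
    demo = DeepDRA(cell_sizes, drug_sizes, latent_dim, latent_dim,
                   mlp_input_dim=2 * latent_dim, mlp_output_dim=1)
    cell_x = torch.randn(8, sum(cell_sizes))  # batch of 8 cell-feature vectors
    drug_x = torch.randn(8, sum(drug_sizes))  # batch of 8 drug-feature vectors
    cell_dec, drug_dec, pred = demo(cell_x, drug_x)
    print(cell_dec.shape, drug_dec.shape, pred.shape)  # (8, 150) (8, 100) (8, 1)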

@@ -0,0 +1,105 @@
import torch
import torch.nn as nn
import torch.optim as optim


class Autoencoder(nn.Module):
    """
    Autoencoder neural network model for feature learning.

    Parameters:
    - input_dim (int): Dimensionality of the input features.
    - latent_dim (int): Dimensionality of the latent space.
    """

    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()

        # Encoder architecture
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, latent_dim),
            nn.ReLU(inplace=True),
        )

        # Decoder architecture
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, input_dim),
        )

    def forward(self, x):
        """
        Forward pass of the autoencoder.

        Parameters:
        - x (torch.Tensor): Input tensor.

        Returns:
        - decoded (torch.Tensor): Decoded output tensor.
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
def trainAutoencoder(model, train_loader, val_loader, num_epochs, name):
    """
    Train the autoencoder model.

    Parameters:
    - model (Autoencoder): The autoencoder model to be trained.
    - train_loader (DataLoader): DataLoader for the training dataset.
    - val_loader (DataLoader): DataLoader for the validation dataset.
    - num_epochs (int): Number of training epochs.
    - name (str): Name used when saving the trained model.

    Returns:
    - None
    """
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-8)
    train_loss = []
    val_final_loss = []
    for epoch in range(num_epochs):
        # Training
        model.train()
        total_train_loss = 0.0
        for batch_idx, data in enumerate(train_loader):
            data = data[0]
            output = model(data)
            loss = loss_fn(output, data)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Accumulate the scalar loss, not the graph-attached tensor
            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        train_loss.append(avg_train_loss)

        # Validation
        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():
            for val_batch_idx, val_data in enumerate(val_loader):
                val_data = val_data[0]
                val_output = model(val_data)
                val_loss = loss_fn(val_output, val_data)
                total_val_loss += val_loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        val_final_loss.append(avg_val_loss)

        print('Epoch [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}'.format(
            epoch + 1, num_epochs, avg_train_loss, avg_val_loss))

    # Save the trained model
    torch.save(model.state_dict(), 'autoencoder' + name + '.pth')
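
# --- Minimal usage sketch (illustrative only, not part of the original file) ---
# Trains the autoencoder on random data just to check that the pipeline runs;
# the 200-feature input and latent size of 50 are arbitrary demo choices.
if __name__ == '__main__':
    from torch.utils.data import DataLoader, TensorDataset

    x_train = torch.randn(256, 200)
    x_val = torch.randn(64, 200)
    train_loader = DataLoader(TensorDataset(x_train), batch_size=32, shuffle=True)
    val_loader = DataLoader(TensorDataset(x_val), batch_size=32)

    ae = Autoencoder(input_dim=200, latent_dim=50)
    trainAutoencoder(ae, train_loader, val_loader, num_epochs=3, name='_demo')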

@@ -0,0 +1,249 @@
import os

from utils import *


class RawDataLoader:
    @staticmethod
    def load_data(data_modalities, raw_file_directory, screen_file_directory, sep):
        """
        Load raw data and screening data, perform intersection, and adjust screening data.

        Parameters:
        - data_modalities (list): List of data modalities to load.
        - raw_file_directory (str): Directory containing raw data files.
        - screen_file_directory (str): Directory containing screening data files.
        - sep (str): Separator used in the data files.

        Returns:
        - data (dict): Dictionary containing loaded raw data.
        - drug_screen (pd.DataFrame): Adjusted and intersected screening data.
        """
        # Step 1: Load raw data files for specified data modalities
        data = RawDataLoader.load_raw_files(intersect=True, data_modalities=data_modalities,
                                            raw_file_directory=raw_file_directory)

        # Step 2: Load drug data files for specified data modalities
        drug_data = RawDataLoader.load_raw_files(intersect=True, data_modalities=data_modalities,
                                                 raw_file_directory=DRUG_DATA_FOLDER)

        # Step 3: Update the 'data' dictionary with drug data
        data.update(drug_data)

        # Step 4: Load and adjust drug screening data
        drug_screen = RawDataLoader.load_screening_files(filename=screen_file_directory, sep=sep)
        drug_screen, data = RawDataLoader.adjust_screening_raw(drug_screen=drug_screen, data_dict=data)

        # Step 5: Return the loaded data and adjusted drug screening data
        return data, drug_screen
    @staticmethod
    def intersect_features(data1, data2):
        """
        Perform intersection of features between two datasets.

        Parameters:
        - data1 (pd.DataFrame): First dataset.
        - data2 (pd.DataFrame): Second dataset.

        Returns:
        - data1 (pd.DataFrame): First dataset with common columns.
        - data2 (pd.DataFrame): Second dataset with common columns.
        """
        # Step 1: Find common columns between the two datasets
        common_columns = list(set(data1.columns) & set(data2.columns))

        # Step 2: Filter both datasets to include only the common columns
        data2 = data2[common_columns]
        data1 = data1[common_columns]

        # Step 3: Return the datasets with intersected features
        return data1, data2
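
    # Example: if data1 has columns {A, B, C} and data2 has columns {B, C, D},
    # both come back with only the shared columns {B, C} (column order is
    # arbitrary, since a set intersection is used).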
    @staticmethod
    def data_features_intersect(data1, data2):
        """
        Intersect features between two datasets, modality by modality.

        Parameters:
        - data1 (dict): Dictionary containing data modalities.
        - data2 (dict): Dictionary containing data modalities.

        Returns:
        - intersected_data1 (dict): data1 with intersected features.
        - intersected_data2 (dict): data2 with intersected features.
        """
        # Intersect features for each modality
        for i in data1:
            data1[i], data2[i] = RawDataLoader.intersect_features(data1[i], data2[i])
        return data1, data2
    @staticmethod
    def load_file(address, index_column=None):
        """
        Load data from a file based on its format (.txt/.tsv, .csv, or .xlsx).

        Parameters:
        - address (str): File address.
        - index_column (str): Name of the index column.

        Returns:
        - data (pd.DataFrame): Loaded data, or None if the file was not found.
        """
        data = None
        try:
            # Load data based on file format
            if address.endswith('.txt') or address.endswith('.tsv'):
                data = pd.read_csv(address, sep='\t', index_col=index_column)
            elif address.endswith('.csv'):
                data = pd.read_csv(address)
            elif address.endswith('.xlsx'):
                data = pd.read_excel(address)
        except FileNotFoundError:
            print(f'File not found at address: {address}')
        return data
    @staticmethod
    def load_raw_files(raw_file_directory, data_modalities, intersect=True):
        """
        Load every '*_raw' file in the directory whose name starts with one of the
        requested data modalities, optionally intersecting cell-line and drug names
        across modalities.
        """
        raw_dict = {}
        files = os.listdir(raw_file_directory)
        cell_line_names = None
        drug_names = None
        for file in tqdm(files, 'Reading Raw Data Files...'):
            if any([file.startswith(x) for x in data_modalities]):
                if file.endswith('_raw.gzip'):
                    df = pd.read_parquet(os.path.join(raw_file_directory, file))
                elif file.endswith('_raw.tsv'):
                    df = pd.read_csv(os.path.join(raw_file_directory, file), sep='\t', index_col=0)
                else:
                    continue
                if pd.api.types.is_numeric_dtype(df.index):
                    df = df.set_index(df.columns[0])
                df = df.sort_index()
                df = df.sort_index(axis=1)
                df.columns = df.columns.str.replace('_cell_mut', '')
                df.columns = df.columns.str.replace('_cell_CN', '')
                df.columns = df.columns.str.replace('_cell_exp', '')

                # The drug_comp raw table has some NA values, so impute them
                if df.isna().any().any():
                    df = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(df),
                                      columns=df.columns, index=df.index)
                if file.startswith('drug_comp'):  # Normalize the drug composition dataset
                    df = ((df - df.mean()) / df.std()).fillna(0)
                elif file.startswith('drug_desc'):  # Normalize the drug descriptor dataset
                    df = ((df - df.mean()) / df.std()).fillna(0)
                if intersect:
                    if file.startswith('cell'):
                        if cell_line_names is not None:
                            cell_line_names = cell_line_names.intersection(set(df.index))
                        else:
                            cell_line_names = set(df.index)
                    elif file.startswith('drug'):
                        if drug_names is not None:
                            drug_names = drug_names.intersection(set(df.index))
                        else:
                            drug_names = set(df.index)
                raw_dict[file[:file.find('_raw')]] = df
        if intersect:
            # Restrict every modality to the shared cell lines / drugs and drop duplicated rows
            for key, value in raw_dict.items():
                if key.startswith('cell'):
                    data = value.loc[list(cell_line_names)]
                    raw_dict[key] = data.loc[~data.index.duplicated()]
                elif key.startswith('drug'):
                    data = value.loc[list(drug_names)]
                    raw_dict[key] = data.loc[~data.index.duplicated()]
        return raw_dict
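
    # Expected layout (inferred from the name checks above): files are named
    # '<modality>_raw.tsv' or '<modality>_raw.gzip' (parquet), e.g.
    #   data/GDSC_data/cell_mut_raw.tsv
    #   data/drug_data/drug_desc_raw.gzip
    # with cell lines or drugs as rows and features as columns.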
    @staticmethod
    def load_screening_files(filename="AUC_matS_comb.tsv", sep=','):
        df = pd.read_csv(filename, sep=sep, index_col=0)
        return df
    @staticmethod
    def adjust_screening_raw(drug_screen, data_dict):
        """
        Restrict the screening matrix and every raw-data modality to the cell lines
        and drugs they have in common.
        """
        raw_cell_names = []
        raw_drug_names = []
        for key, value in data_dict.items():
            if 'cell' in key:
                if len(raw_cell_names) == 0:
                    raw_cell_names = value.index
                else:
                    raw_cell_names = raw_cell_names.intersection(value.index)
            elif 'drug' in key:
                raw_drug_names = value.index

        screening_cell_names = drug_screen.index
        screening_drug_names = drug_screen.columns

        common_cell_names = list(set(raw_cell_names).intersection(set(screening_cell_names)))
        common_drug_names = list(set(raw_drug_names).intersection(set(screening_drug_names)))
        for key, value in data_dict.items():
            if 'cell' in key:
                data_dict[key] = value.loc[common_cell_names]
            else:
                data_dict[key] = value.loc[common_drug_names]
        return drug_screen.loc[common_cell_names, common_drug_names], data_dict
    @staticmethod
    def prepare_input_data(data_dict, screening):
        """
        Build paired (cell, drug) feature matrices and labels from the screening
        matrix: entries equal to 1 are resistant pairs (label 0) and entries equal
        to -1 are sensitive pairs (label 1).
        """
        print('Preparing data...')
        resistance = np.argwhere((screening.to_numpy() == 1)).tolist()
        resistance.sort(key=lambda x: (x[1], x[0]))
        resistance = np.array(resistance)
        sensitive = np.argwhere((screening.to_numpy() == -1)).tolist()
        sensitive.sort(key=lambda x: (x[1], x[0]))
        sensitive = np.array(sensitive)

        print("sensitive train data len:", len(sensitive))
        print("resistance train data len:", len(resistance))

        # Concatenate all cell modalities column-wise, tagging columns with their modality
        cell_data_types = list(filter(lambda x: x.startswith('cell'), data_dict.keys()))
        cell_data_types.sort()
        cell_data = pd.concat(
            [pd.DataFrame(data_dict[data_type].add_suffix(f'_{data_type}'), dtype=np.float32) for
             data_type in cell_data_types], axis=1)
        cell_data_sizes = [data_dict[data_type].shape[1] for data_type in cell_data_types]

        # Concatenate all drug modalities the same way
        drug_data_types = list(filter(lambda x: x.startswith('drug'), data_dict.keys()))
        drug_data_types.sort()
        drug_data = pd.concat(
            [pd.DataFrame(data_dict[data_type].add_suffix(f'_{data_type}'), dtype=np.float32)
             for data_type in drug_data_types], axis=1)
        drug_data_sizes = [data_dict[data_type].shape[1] for data_type in drug_data_types]

        # Resistant pairs, indexed by '(cell line, drug)'
        Xp_cell = cell_data.iloc[resistance[:, 0], :]
        Xp_drug = drug_data.iloc[resistance[:, 1], :]
        Xp_cell = Xp_cell.reset_index(drop=True)
        Xp_drug = Xp_drug.reset_index(drop=True)
        Xp_cell.index = [f'({screening.index[x[0]]},{screening.columns[x[1]]})' for x in resistance]
        Xp_drug.index = [f'({screening.index[x[0]]},{screening.columns[x[1]]})' for x in resistance]

        # Sensitive pairs, indexed the same way
        Xn_cell = cell_data.iloc[sensitive[:, 0], :]
        Xn_drug = drug_data.iloc[sensitive[:, 1], :]
        Xn_cell = Xn_cell.reset_index(drop=True)
        Xn_drug = Xn_drug.reset_index(drop=True)
        Xn_cell.index = [f'({screening.index[x[0]]},{screening.columns[x[1]]})' for x in sensitive]
        Xn_drug.index = [f'({screening.index[x[0]]},{screening.columns[x[1]]})' for x in sensitive]

        X_cell = pd.concat([Xp_cell, Xn_cell])
        X_drug = pd.concat([Xp_drug, Xn_drug])
        Y = np.append(np.zeros(resistance.shape[0]), np.ones(sensitive.shape[0]))
        return X_cell, X_drug, Y, cell_data_sizes, drug_data_sizes
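
# --- Minimal usage sketch (illustrative only, not part of the original file) ---
# Loads GDSC raw data plus its screening matrix and converts them into paired
# model inputs, mirroring how main.py calls this class; DATA_MODALITIES and the
# folder constants come from utils.py and require the data folders to exist.
if __name__ == '__main__':
    data, screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES,
                                           raw_file_directory=GDSC_RAW_DATA_FOLDER,
                                           screen_file_directory=GDSC_SCREENING_DATA_FOLDER,
                                           sep='\t')
    x_cell, x_drug, y, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(data, screen)
    print(x_cell.shape, x_drug.shape, y.shape)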

@@ -0,0 +1,130 @@
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, f1_score, precision_score, \
    recall_score, accuracy_score

from utils import *


class Evaluation:
    @staticmethod
    def plot_train_val_accuracy(train_accuracies, val_accuracies, num_epochs):
        plt.xlabel('epoch')
        plt.ylabel('accuracy')
        plt.title('Train vs. validation accuracy')
        plt.plot(range(1, num_epochs + 1), train_accuracies, label='train')
        plt.plot(range(1, num_epochs + 1), val_accuracies, label='validation')
        plt.legend()
        plt.show()

    @staticmethod
    def plot_train_val_loss(train_loss, val_loss, num_epochs):
        plt.xlabel('epoch')
        plt.ylabel('loss')
        plt.title('Train vs. validation loss')
        plt.plot(range(1, num_epochs + 1), train_loss, label='train')
        plt.plot(range(1, num_epochs + 1), val_loss, label='validation')
        plt.legend()
        plt.show()
    @staticmethod
    def evaluate(all_targets, mlp_output, show_plot=True):
        # Threshold the MLP scores at 0.5 to obtain hard labels
        predicted_labels = np.where(mlp_output > 0.5, 1, 0).reshape(-1)
        all_predictions = predicted_labels

        # Calculate AUC
        fpr, tpr, thresholds = metrics.roc_curve(all_targets, mlp_output)
        auc = np.round(metrics.auc(fpr, tpr), 2)

        # Calculate AUPRC (keep the curve arrays separate from the scalar metrics below)
        precisions, recalls, thresholds = metrics.precision_recall_curve(all_targets, mlp_output)
        auprc = np.round(metrics.auc(recalls, precisions), 2)

        print('Accuracy: {:.2f}'.format(np.round(accuracy_score(all_targets, all_predictions), 2)))
        print('AUC: {:.2f}'.format(auc))
        print('AUPRC: {:.2f}'.format(auprc))

        # Calculate and print the confusion matrix; class 0 (resistant) is treated
        # as the positive class in the precision/recall formulas below.
        cm = confusion_matrix(all_targets, all_predictions)
        accuracy = cm.trace() / np.sum(cm)
        precision = cm[0, 0] / (cm[0, 0] + cm[1, 0])
        recall = cm[0, 0] / (cm[0, 0] + cm[0, 1])
        f1_score = 2 * precision * recall / (precision + recall)
        print('Confusion matrix:\n', cm, sep='')
        print(f'Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1 score: {f1_score:.3f}')

        if show_plot:
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'ROC Curve: AUC={auc}')
            plt.plot(fpr, tpr)
            plt.show()

            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title(f'PR Curve: AUPRC={auprc}')
            plt.plot(recalls, precisions)
            plt.show()

            # Split predicted scores by ground-truth class for the violin plot
            prediction_targets = pd.DataFrame(
                {'Prediction': np.asarray(mlp_output).reshape(-1),
                 'Target': np.asarray(all_targets).reshape(-1)})
            class_one = prediction_targets.loc[prediction_targets['Target'] == 0, 'Prediction'].astype(
                np.float32).tolist()
            class_minus_one = prediction_targets.loc[prediction_targets['Target'] == 1, 'Prediction'].astype(
                np.float32).tolist()

            fig, ax = plt.subplots()
            ax.set_ylabel("DeepDRA score")
            xticklabels = ['Responder', 'Non Responder']
            ax.set_xticks([1, 2])
            ax.set_xticklabels(xticklabels)
            data_to_plot = [class_minus_one, class_one]
            plt.ylim(0, 1)
            p_value = ttest_ind(class_one, class_minus_one)[1]
            cancer = 'all'
            plt.title(f'Responder/Non responder scores for {cancer} cancer with \np-value ~= {p_value:.2e}')
            bp = ax.violinplot(data_to_plot, showextrema=True, showmeans=True, showmedians=True)
            bp['cmeans'].set_color('r')
            bp['cmedians'].set_color('g')
            plt.show()

        return {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 score': f1_score, 'AUC': auc,
                'AUPRC': auprc}
    @staticmethod
    def add_results(result_list, current_result):
        result_list['AUC'].append(current_result['AUC'])
        result_list['AUPRC'].append(current_result['AUPRC'])
        result_list['Accuracy'].append(current_result['Accuracy'])
        result_list['Precision'].append(current_result['Precision'])
        result_list['Recall'].append(current_result['Recall'])
        result_list['F1 score'].append(current_result['F1 score'])
        return result_list

    @staticmethod
    def show_final_results(result_list):
        print("Final Results:")
        for i in range(len(result_list["AUC"])):
            accuracy = result_list['Accuracy'][i]
            precision = result_list['Precision'][i]
            recall = result_list['Recall'][i]
            f1_score = result_list['F1 score'][i]
            auc = result_list['AUC'][i]
            auprc = result_list['AUPRC'][i]
            print(f'Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, '
                  f'F1 score: {f1_score:.3f}, AUC: {auc:.3f}, AUPRC: {auprc:.3f}')

        avg_auc = np.mean(result_list['AUC'])
        avg_auprc = np.mean(result_list['AUPRC'])
        std_auprc = np.std(result_list['AUPRC'])
        print("Average AUC: {:.3f} \t Average AUPRC: {:.3f} \t Std AUPRC: {:.3f}".format(avg_auc, avg_auprc, std_auprc))

@@ -0,0 +1,159 @@
import random

import numpy as np
import torch
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler

from DeepDRA import DeepDRA, train, test
from data_loader import RawDataLoader
from evaluation import Evaluation
from mlp import MLP
from utils import *


def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test, cell_sizes, drug_sizes):
    """
    Train and evaluate the DeepDRA model.

    Parameters:
    - x_cell_train (pd.DataFrame): Training data for the cell modality.
    - x_cell_test (pd.DataFrame): Test data for the cell modality.
    - x_drug_train (pd.DataFrame): Training data for the drug modality.
    - x_drug_test (pd.DataFrame): Test data for the drug modality.
    - y_train (np.ndarray): Training labels.
    - y_test (np.ndarray): Test labels.
    - cell_sizes (list): Sizes of the cell modality features.
    - drug_sizes (list): Sizes of the drug modality features.

    Returns:
    - result: Evaluation result on the test set.
    """
    # Step 1: Define the batch size for training
    batch_size = 64

    # Step 2: Instantiate the combined model
    ae_latent_dim = 50
    mlp_input_dim = 2 * ae_latent_dim
    mlp_output_dim = 1
    num_epochs = 20
    model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim, mlp_input_dim, mlp_output_dim)

    # Step 3: Convert the training data to PyTorch tensors and normalize it.
    # Note: normalize(..., dim=0) rescales each feature column to unit L2 norm;
    # it is applied separately to the train and test matrices.
    x_cell_train_tensor = torch.Tensor(x_cell_train.values)
    x_drug_train_tensor = torch.Tensor(x_drug_train.values)
    x_cell_train_tensor = torch.nn.functional.normalize(x_cell_train_tensor, dim=0)
    x_drug_train_tensor = torch.nn.functional.normalize(x_drug_train_tensor, dim=0)
    y_train_tensor = torch.Tensor(y_train)
    y_train_tensor = y_train_tensor.unsqueeze(1)

    # Step 4: Create a TensorDataset with the input features and target labels
    train_dataset = TensorDataset(x_cell_train_tensor, x_drug_train_tensor, y_train_tensor)

    # Step 5: Create the train_loader
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Step 6: Train the model
    train(model, train_loader, num_epochs=num_epochs)

    # Step 7: Save the trained model
    torch.save(model, 'DeepDRA.pth')

    # Step 8: Load the saved model
    # (On newer PyTorch versions, torch.load may need weights_only=False to unpickle a full model.)
    model = torch.load('DeepDRA.pth')

    # Step 9: Convert the test data to PyTorch tensors and normalize it
    x_cell_test_tensor = torch.Tensor(x_cell_test.values)
    x_drug_test_tensor = torch.Tensor(x_drug_test.values)
    y_test_tensor = torch.Tensor(y_test)
    x_cell_test_tensor = torch.nn.functional.normalize(x_cell_test_tensor, dim=0)
    x_drug_test_tensor = torch.nn.functional.normalize(x_drug_test_tensor, dim=0)

    # Step 10: Create a TensorDataset with the input features and target labels for testing
    test_dataset = TensorDataset(x_cell_test_tensor, x_drug_test_tensor, y_test_tensor)
    test_loader = DataLoader(test_dataset, batch_size=len(x_cell_test))

    # Step 11: Test the model
    return test(model, test_loader)
def run(k, is_test=False):
    """
    Run the training and evaluation process k times.

    Parameters:
    - k (int): Number of times to run the process.
    - is_test (bool): If True, evaluate on the external test set; otherwise, perform a train-validation split.

    Returns:
    - history (dict): Dictionary containing evaluation metrics for each run.
    """
    # Step 1: Initialize a dictionary to store evaluation metrics
    history = {'AUC': [], 'AUPRC': [], "Accuracy": [], "Precision": [], "Recall": [], "F1 score": []}

    # Step 2: Load training data
    train_data, train_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES,
                                                            raw_file_directory=GDSC_RAW_DATA_FOLDER,
                                                            screen_file_directory=GDSC_SCREENING_DATA_FOLDER,
                                                            sep="\t")

    # Step 3: Load test data if applicable
    if is_test:
        test_data, test_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES,
                                                              raw_file_directory=CCLE_RAW_DATA_FOLDER,
                                                              screen_file_directory=CCLE_SCREENING_DATA_FOLDER,
                                                              sep="\t")
        train_data, test_data = RawDataLoader.data_features_intersect(train_data, test_data)
        X_cell_test, X_drug_test, y_test, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(test_data,
                                                                                                    test_drug_screen)

    # Step 4: Prepare input data for training
    X_cell_train, X_drug_train, y_train, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(train_data,
                                                                                                   train_drug_screen)

    # Step 5: Loop over k runs
    for i in range(k):
        print('Run {}'.format(i))

        if is_test:
            # Step 6: Randomly under-sample the majority class of the training data.
            # Assign to new names so each run resamples the full training set rather
            # than the previous run's subset.
            rus = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED)
            dataset = pd.concat([X_cell_train, X_drug_train], axis=1)
            dataset, y_resampled = rus.fit_resample(dataset, y_train)
            X_cell_resampled = dataset.iloc[:, :sum(cell_sizes)]
            X_drug_resampled = dataset.iloc[:, sum(cell_sizes):]

            # Step 7: Train and evaluate the DeepDRA model on the external test data
            results = train_DeepDRA(X_cell_resampled, X_cell_test, X_drug_resampled, X_drug_test, y_resampled,
                                    y_test, cell_sizes, drug_sizes)
        else:
            # Step 8: Split the data into training and validation sets.
            # Assign to new names so each run splits the full dataset, not the
            # remainder of the previous run's split.
            x_cell_tr, x_cell_val, x_drug_tr, x_drug_val, y_tr, y_val = train_test_split(X_cell_train,
                                                                                         X_drug_train, y_train,
                                                                                         test_size=0.2,
                                                                                         random_state=44,
                                                                                         shuffle=True)

            # Step 9: Train and evaluate the DeepDRA model on the split data
            results = train_DeepDRA(x_cell_tr, x_cell_val, x_drug_tr, x_drug_val, y_tr, y_val, cell_sizes,
                                    drug_sizes)

        # Step 10: Add results to the history dictionary
        Evaluation.add_results(history, results)

    # Step 11: Display final results
    Evaluation.show_final_results(history)
    return history
if __name__ == '__main__':
    torch.manual_seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    run(10, is_test=True)

@@ -0,0 +1,128 @@
import torch
from torch import nn, optim

from evaluation import Evaluation
from utils import data_modalities_abbreviation


class EarlyStopper:
    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        # Reset the counter whenever a new best validation loss is reached
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        # Count checks where the loss exceeds the best loss by more than min_delta
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False
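
# Usage sketch (illustrative): stop training once validation loss has exceeded
# its best value by more than min_delta for `patience` consecutive checks, e.g.
#   stopper = EarlyStopper(patience=3, min_delta=0.05)
#   for epoch in range(num_epochs):
#       ...  # train, then compute avg_val_loss
#       if stopper.early_stop(avg_val_loss):
#           break
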
class MLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MLP, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, output_dim),
            nn.Hardsigmoid(),
        )

    def forward(self, x):
        return self.mlp(x)
def train_mlp(model, train_loader, val_loader, num_epochs):
    mlp_loss_fn = nn.BCELoss()
    mlp_optimizer = optim.Adadelta(model.parameters(), lr=0.01)
    train_accuracies = []
    val_accuracies = []
    train_loss = []
    val_loss = []
    early_stopper = EarlyStopper(patience=3, min_delta=0.05)
    for epoch in range(num_epochs):
        # Training
        model.train()
        total_train_loss = 0.0
        train_correct = 0
        train_total_samples = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            mlp_optimizer.zero_grad()
            mlp_output = model(data)
            mlp_loss = mlp_loss_fn(mlp_output, target)
            mlp_loss.backward()
            mlp_optimizer.step()
            total_train_loss += mlp_loss.item()

            # Calculate accuracy
            train_predictions = torch.round(mlp_output)
            train_correct += (train_predictions == target).sum().item()
            train_total_samples += target.size(0)

        avg_train_loss = total_train_loss / len(train_loader)
        train_loss.append(avg_train_loss)

        # Validation
        model.eval()
        total_val_loss = 0.0
        correct = 0
        total_samples = 0
        with torch.no_grad():
            for val_batch_idx, (data, val_target) in enumerate(val_loader):
                val_mlp_output = model(data)
                val_mlp_loss = mlp_loss_fn(val_mlp_output, val_target)
                total_val_loss += val_mlp_loss.item()

                # Calculate accuracy
                val_predictions = torch.round(val_mlp_output)
                correct += (val_predictions == val_target).sum().item()
                total_samples += val_target.size(0)

        avg_val_loss = total_val_loss / len(val_loader)
        val_loss.append(avg_val_loss)

        train_accuracy = train_correct / train_total_samples
        train_accuracies.append(train_accuracy)
        val_accuracy = correct / total_samples
        val_accuracies.append(val_accuracy)

        print('Epoch [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}, Train Accuracy: {:.4f}, Val Accuracy: {:.4f}'.format(
            epoch + 1, num_epochs, avg_train_loss, avg_val_loss, train_accuracy, val_accuracy))

        # Early stopping is available but disabled by default:
        # if early_stopper.early_stop(avg_val_loss):
        #     break

    Evaluation.plot_train_val_accuracy(train_accuracies, val_accuracies, epoch + 1)
    Evaluation.plot_train_val_loss(train_loss, val_loss, epoch + 1)
def test_mlp(model, test_loader):
    model.eval()
    # The test loader is expected to yield a single full-dataset batch
    for i, (data, labels) in enumerate(test_loader):
        with torch.no_grad():
            mlp_output = model(data)
        return Evaluation.evaluate(labels, mlp_output)
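
# --- Minimal usage sketch (illustrative only, not part of the original file) ---
# Trains the MLP on random features with random binary labels, just to show the
# expected tensor shapes; 100 input features matches 2 * the autoencoder latent dim.
if __name__ == '__main__':
    from torch.utils.data import DataLoader, TensorDataset

    x = torch.randn(256, 100)
    y = torch.randint(0, 2, (256, 1)).float()
    loader = DataLoader(TensorDataset(x, y), batch_size=32, shuffle=True)
    mlp = MLP(input_dim=100, output_dim=1)
    train_mlp(mlp, loader, loader, num_epochs=2)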

@@ -0,0 +1,116 @@
import os
import pickle
from copy import deepcopy
from math import ceil
from time import time

import h2o
import numpy as np
import pandas as pd
import pyreadr
import requests
import sklearn as sk
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from h2o.estimators import H2ODeepLearningEstimator
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist, squareform
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from statsmodels.stats.weightstats import ttest_ind
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

DATA_FOLDER = 'data'
RES_DATA_FOLDER = os.path.join(DATA_FOLDER, 'res')
TEST_DATA_FOLDER = os.path.join(DATA_FOLDER, 'final_test_data')
TEST_TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_test_data')
SIM_DATA_FOLDER = os.path.join(DATA_FOLDER, 'similarity_data')
RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'raw_data')
RAW_BOTH_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_GDSC_Data')
DRUG_DATA_FOLDER = os.path.join(DATA_FOLDER, 'drug_data')
NEW_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'new_raw_data')
GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data')
CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_raw')
CTRP_FOLDER = os.path.join(DATA_FOLDER, 'CTRP')
GDSC_FOLDER = os.path.join(DATA_FOLDER, 'GDSC')
CCLE_FOLDER = os.path.join(DATA_FOLDER, 'CCLE')
MODEL_FOLDER = os.path.join(DATA_FOLDER, 'model')

CTRP_EXPERIMENT_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_experiment.txt')
CTRP_COMPOUND_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_compound.txt')
CTRP_CELLLINE_FILE = os.path.join(CTRP_FOLDER, 'v20.meta.per_cell_line.txt')
CTRP_AUC_FILE = os.path.join(CTRP_FOLDER, 'v20.data.curves_post_qc.txt')
GDSC_AUC_FILE = os.path.join(GDSC_FOLDER, 'GDSC2_fitted_dose_response.csv')
GDSC_cnv_data_FILE = os.path.join(GDSC_FOLDER, 'cnv_abs_copy_number_picnic_20191101.csv')
GDSC_methy_data_FILE = os.path.join(GDSC_FOLDER, 'F2_METH_CELL_DATA.txt')
GDSC_methy_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'methSampleId_2_cosmicIds.xlsx')
GDSC_exp_data_FILE = os.path.join(GDSC_FOLDER, 'Cell_line_RMA_proc_basalExp.txt')
GDSC_exp_sampleIds_FILE = os.path.join(GDSC_FOLDER, 'E-MTAB-3610.sdrf.txt')
GDSC_mut_data_FILE = os.path.join(GDSC_FOLDER, 'mutations_all_20230202.csv')
GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv')
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv')
BOTH_SCREENING_DATA_FOLDER = os.path.join(RAW_BOTH_DATA_FOLDER, 'drug_screening_matrix_gdsc_ctrp.tsv')
CCLE_mut_data_FILE = os.path.join(CCLE_FOLDER, 'CCLE_mutations.csv')
TABLE_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_table.tsv')
MATRIX_RESULTS_FILE = os.path.join(DATA_FOLDER, 'drug_screening_matrix.tsv')
MODEL_FILE = os.path.join(MODEL_FOLDER, 'trained_model_V1_EMDP.sav')
TEST_FILE = os.path.join(TEST_DATA_FOLDER, 'test.gzip')
RESULT_FILE = os.path.join(RES_DATA_FOLDER, 'result.tsv')
TCGA_DATA_FOLDER = os.path.join(DATA_FOLDER, 'TCGA_test_data')
TCGA_SCREENING_DATA = os.path.join(TCGA_DATA_FOLDER, 'TCGA_screening_matrix.tsv')

BUILD_SIM_MATRICES = True  # Set to True to build similarity matrices from raw data
SIM_KERNEL = {'cell_CN': ('euclidean', 0.001), 'cell_exp': ('euclidean', 0.01), 'cell_methy': ('euclidean', 0.1),
              'cell_mut': ('jaccard', 1), 'drug_DT': ('jaccard', 1), 'drug_comp': ('euclidean', 0.001),
              'drug_desc': ('euclidean', 0.001), 'drug_finger': ('euclidean', 0.001)}
SAVE_MODEL = False  # Set to True to save the trained model
VARIATIONAL_AUTOENCODERS = False

# DATA_MODALITIES = ['cell_CN', 'cell_exp', 'cell_methy', 'cell_mut', 'drug_comp', 'drug_DT']  # Change this list to only consider specific data modalities
DATA_MODALITIES = ['cell_mut', 'drug_desc', 'drug_finger']
RANDOM_SEED = 42  # Use this seed wherever randomness is involved, for reproducibility


def data_modalities_abbreviation():
    abb = []
    if 'cell_CN' in DATA_MODALITIES:
        abb.append('C')
    if 'cell_exp' in DATA_MODALITIES:
        abb.append('E')
    if 'cell_mut' in DATA_MODALITIES:
        abb.append('M')
    if 'cell_methy' in DATA_MODALITIES:
        abb.append('T')
    if 'drug_DT' in DATA_MODALITIES:
        abb.append('D')
    if 'drug_comp' in DATA_MODALITIES:
        abb.append('P')
    return ''.join(abb)
""" TRAIN_INTEGRATION_METHOD used for each cell's and drug_data's data definitions: | |||
SIMILARITY: A kernel based integration method in which based on the similarity of each cell's data with the training cell's | |||
data the input features for the multi layer perceptron (MLP) is constructed. The similarity function used could be different for | |||
each data modality (euclidean, jaccard,l1_norm, or ...) | |||
AUTO_ENCODER_V1: In this version of integrating multi-omics, for each data modality an autoencoder is trained to reduce the | |||
dimension of the features and finally a concatenation of each autoencoder's latent space builds up the input layer of the MLP. | |||
AUTO_ENCODER_V2: In this version of integrating multi-omics data, we train a big autoencoder which reduces the dimension of | |||
all the different data modalities features at the same time to a smaller feature space. This version of integrating could | |||
take a lot of memory and time to integrate the data and might be computationally expensive. | |||
AUTO_ENCODER_V3: IN this version of integrating multi-omics data, we train an autoencoder for all the modalities kinda same as | |||
the autoencoder version 2 but with this difference that the encoder and decoder layers are separate from each other and | |||
just the latent layer is shared among different data modalities. | |||
""" |