Browse Source

Merge branch 'main' of https://github.com/tahamv/DeepDRA

# Conflicts:
#	main.py
main
taha 11 months ago
parent
commit
448bf54290
6 changed files with 95 additions and 41 deletions
  1. 4
    3
      DeepDRA.py
  2. 7
    10
      data_loader.py
  3. 3
    1
      evaluation.py
  4. 78
    25
      main.py
  5. 1
    1
      mlp.py
  6. 2
    1
      utils.py

+ 4
- 3
DeepDRA.py View File

- mlp_output_dim (int): Output dimension for the MLP. - mlp_output_dim (int): Output dimension for the MLP.
""" """


def __init__(self, cell_modality_sizes, drug_modality_sizes, cell_ae_latent_dim, drug_ae_latent_dim, mlp_input_dim,
mlp_output_dim):
def __init__(self, cell_modality_sizes, drug_modality_sizes, cell_ae_latent_dim, drug_ae_latent_dim):
super(DeepDRA, self).__init__() super(DeepDRA, self).__init__()


# Initialize cell and drug autoencoders # Initialize cell and drug autoencoders
self.drug_modality_sizes = drug_modality_sizes self.drug_modality_sizes = drug_modality_sizes


# Initialize MLP # Initialize MLP
self.mlp = MLP(mlp_input_dim, mlp_output_dim)
self.mlp = MLP(cell_ae_latent_dim+drug_ae_latent_dim, 1)


def forward(self, cell_x, drug_x): def forward(self, cell_x, drug_x):
""" """


# Backward pass and optimization # Backward pass and optimization
total_loss.backward() total_loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

mlp_optimizer.step() mlp_optimizer.step()
total_train_loss += total_loss.item() total_train_loss += total_loss.item()



+ 7
- 10
data_loader.py View File



class RawDataLoader: class RawDataLoader:
@staticmethod @staticmethod
def load_data(data_modalities, raw_file_directory, screen_file_directory, sep):
def load_data(data_modalities, raw_file_directory, screen_file_directory, sep, drug_directory=DRUG_DATA_FOLDER):
""" """
Load raw data and screening data, perform intersection, and adjust screening data. Load raw data and screening data, perform intersection, and adjust screening data.




# Step 2: Load drug data files for specified data modalities # Step 2: Load drug data files for specified data modalities
drug_data = RawDataLoader.load_raw_files(intersect=True, data_modalities=data_modalities, drug_data = RawDataLoader.load_raw_files(intersect=True, data_modalities=data_modalities,
raw_file_directory=DRUG_DATA_FOLDER)
raw_file_directory=drug_directory)


# Step 3: Update the 'data' dictionary with drug data # Step 3: Update the 'data' dictionary with drug data
data.update(drug_data) data.update(drug_data)
df.columns = df.columns.str.replace('_cell_CN', '') df.columns = df.columns.str.replace('_cell_CN', '')
df.columns = df.columns.str.replace('_cell_exp', '') df.columns = df.columns.str.replace('_cell_exp', '')


# Note that drug_comp raw table has some NA values so we should impute it
if any(df.isna()):
df = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(df),
columns=df.columns).set_index(df.index)
if file.startswith('drug_comp'): # We need to normalize the drug_data comp dataset
df = ((df - df.mean()) / df.std()).fillna(0)
elif file.startswith('drug_desc'): # We need to normalize the drug_data comp dataset
df = ((df - df.mean()) / df.std()).fillna(0)
df = (df - df.min()) / (df.max() - df.min())
df = df.fillna(0)

print("has null:")
print(df.isnull().sum().sum())
if intersect: if intersect:
if file.startswith('cell'): if file.startswith('cell'):
if cell_line_names: if cell_line_names:

+ 3
- 1
evaluation.py View File





@staticmethod @staticmethod
def evaluate(all_targets, mlp_output, show_plot=True):
def evaluate(all_targets, mlp_output, show_plot=False):
""" """
Evaluate model performance based on predictions and targets. Evaluate model performance based on predictions and targets.


f'AVG: Accuracy: {avg_accuracy:.3f}, Precision: {avg_precision:.3f}, Recall: {avg_recal:.3f}, F1 score: {avg_f1score:.3f}, AUC: {avg_auc:.3f}, ,AUPRC: {avg_auprc:.3f}') f'AVG: Accuracy: {avg_accuracy:.3f}, Precision: {avg_precision:.3f}, Recall: {avg_recal:.3f}, F1 score: {avg_f1score:.3f}, AUC: {avg_auc:.3f}, ,AUPRC: {avg_auprc:.3f}')


print(" Average AUC: {:.3f} \t Average AUPRC: {:.3f} \t Std AUPRC: {:.3f}".format(avg_auc, avg_auprc, std_auprc)) print(" Average AUC: {:.3f} \t Average AUPRC: {:.3f} \t Std AUPRC: {:.3f}".format(avg_auc, avg_auprc, std_auprc))
return {'Accuracy': avg_accuracy, 'Precision': avg_precision, 'Recall': avg_recal, 'F1 score': avg_f1score, 'AUC': avg_auc,
'AUPRC': avg_auprc}

+ 78
- 25
main.py View File

from sklearn.utils.class_weight import compute_class_weight from sklearn.utils.class_weight import compute_class_weight


from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler
from sklearn.model_selection import KFold


from DeepDRA import DeepDRA, train, test from DeepDRA import DeepDRA, train, test
from data_loader import RawDataLoader from data_loader import RawDataLoader
import numpy as np import numpy as np
import pandas as pd import pandas as pd


# Step 1: Define the batch size for training
batch_size = 64

# Step 2: Define the autoencoder latent dimension and the number of training epochs
ae_latent_dim = 50
num_epochs = 25


def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test, cell_sizes, drug_sizes,device): def train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test, cell_sizes, drug_sizes,device):
""" """
- result: Evaluation result on the test set. - result: Evaluation result on the test set.
""" """


# Step 1: Define the batch size for training
batch_size = 64


# Step 2: Instantiate the combined model
ae_latent_dim = 50
mlp_input_dim = 2 * ae_latent_dim
mlp_output_dim = 1
num_epochs = 25
model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim, mlp_input_dim, mlp_output_dim) model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim, mlp_input_dim, mlp_output_dim)
model= model.to(device) model= model.to(device)

# Step 3: Convert your training data to PyTorch tensors # Step 3: Convert your training data to PyTorch tensors
x_cell_train_tensor = torch.Tensor(x_cell_train.values) x_cell_train_tensor = torch.Tensor(x_cell_train.values)
x_drug_train_tensor = torch.Tensor(x_drug_train.values) x_drug_train_tensor = torch.Tensor(x_drug_train.values)
# Step 11: Test the model # Step 11: Test the model
return test(model, test_loader) return test(model, test_loader)


def cv_train(x_cell_train, x_drug_train, y_train, cell_sizes,
drug_sizes, device, k=5, ):


splits = KFold(n_splits=k, shuffle=True, random_state=RANDOM_SEED)
history = {'AUC': [], 'AUPRC': [], "Accuracy": [], "Precision": [], "Recall": [], "F1 score": []}

for fold, (train_idx, val_idx) in enumerate(splits.split(np.arange(len(x_cell_train)))):
print('Fold {}'.format(fold + 1))

train_sampler = SubsetRandomSampler(train_idx)
test_sampler = SubsetRandomSampler(val_idx)
model = DeepDRA(cell_sizes, drug_sizes, ae_latent_dim, ae_latent_dim)
# Convert your training data to PyTorch tensors
x_cell_train_tensor = torch.Tensor(x_cell_train.values)
x_drug_train_tensor = torch.Tensor(x_drug_train.values)

y_train_tensor = torch.Tensor(y_train)
y_train_tensor = y_train_tensor.unsqueeze(1)

# Compute class weights
classes = [0, 1] # Assuming binary classification
class_weights = torch.tensor(compute_class_weight(class_weight='balanced', classes=classes, y=y_train),
dtype=torch.float32)

# Create a TensorDataset with the input features and target labels
train_dataset = TensorDataset(x_cell_train_tensor, x_drug_train_tensor, y_train_tensor)

# Create the train_loader
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
# Train the model
train(model, train_loader,train_loader, num_epochs, class_weights)


# Create the test_loader, drawing only this fold's validation indices via test_sampler
test_loader = DataLoader(train_dataset, batch_size=len(x_cell_train), sampler=test_sampler)

# Test the model
results = test(model, test_loader)


def run(k, is_test=False):
# Add this fold's results to the history dictionary
Evaluation.add_results(history, results)


return Evaluation.show_final_results(history)

def run(k, is_test=False ):
""" """
Run the training and evaluation process k times. Run the training and evaluation process k times.


screen_file_directory=BOTH_SCREENING_DATA_FOLDER, screen_file_directory=BOTH_SCREENING_DATA_FOLDER,
sep="\t") sep="\t")



# Step 3: Load test data if applicable # Step 3: Load test data if applicable
if is_test: if is_test:
test_data, test_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES, test_data, test_drug_screen = RawDataLoader.load_data(data_modalities=DATA_MODALITIES,
screen_file_directory=CCLE_SCREENING_DATA_FOLDER, screen_file_directory=CCLE_SCREENING_DATA_FOLDER,
sep="\t") sep="\t")
train_data, test_data = RawDataLoader.data_features_intersect(train_data, test_data) train_data, test_data = RawDataLoader.data_features_intersect(train_data, test_data)
X_cell_test, X_drug_test, y_test, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(test_data,
test_drug_screen)



# Step 4: Prepare input data for training # Step 4: Prepare input data for training
X_cell_train, X_drug_train, y_train, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(train_data,
x_cell_train, x_drug_train, y_train, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(train_data,
train_drug_screen) train_drug_screen)


if is_test:
x_cell_test, x_drug_test, y_test, cell_sizes, drug_sizes = RawDataLoader.prepare_input_data(test_data,
test_drug_screen)

rus = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED) rus = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED)
dataset = pd.concat([X_cell_train, X_drug_train], axis=1)
dataset.index = X_cell_train.index
dataset = pd.concat([x_cell_train, x_drug_train], axis=1)
dataset.index = x_cell_train.index
dataset, y_train = rus.fit_resample(dataset, y_train) dataset, y_train = rus.fit_resample(dataset, y_train)
X_cell_train = dataset.iloc[:, :sum(cell_sizes)]
X_drug_train = dataset.iloc[:, sum(cell_sizes):]
x_cell_train = dataset.iloc[:, :sum(cell_sizes)]
x_drug_train = dataset.iloc[:, sum(cell_sizes):]


# Step 5: Loop over k runs # Step 5: Loop over k runs
for i in range(k): for i in range(k):
if is_test: if is_test:


# Step 7: Train and evaluate the DeepDRA model on test data # Step 7: Train and evaluate the DeepDRA model on test data
results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes,
results = train_DeepDRA(x_cell_train, x_cell_test, x_drug_train, x_drug_test, y_train, y_test, cell_sizes,
drug_sizes, device) drug_sizes, device)

else: else:
# Step 8: Split the data into training and validation sets
X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test = train_test_split(X_cell_train,
X_drug_train, y_train,
test_size=0.2,
random_state=RANDOM_SEED,
shuffle=True)
# Step 9: Train and evaluate the DeepDRA model on the split data
results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes,
drug_sizes, device)
# # Step 8: Split the data into training and validation sets
# X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test = train_test_split(X_cell_train,
# X_drug_train, y_train,
# test_size=0.2,
# random_state=RANDOM_SEED,
# shuffle=True)
# # Step 9: Train and evaluate the DeepDRA model on the split data
# results = train_DeepDRA(X_cell_train, X_cell_test, X_drug_train, X_drug_test, y_train, y_test, cell_sizes,
# drug_sizes, device)

results = cv_train(x_cell_train, x_drug_train, y_train, cell_sizes, drug_sizes, device, k=5)


# Step 10: Add results to the history dictionary # Step 10: Add results to the history dictionary
Evaluation.add_results(history, results) Evaluation.add_results(history, results)

+ 1
- 1
mlp.py View File

nn.Linear(input_dim, 128), nn.Linear(input_dim, 128),
nn.ReLU(inplace=True), nn.ReLU(inplace=True),
nn.Linear(128, output_dim), nn.Linear(128, output_dim),
nn.Hardsigmoid(),
nn.Sigmoid(),
) )


def forward(self, x): def forward(self, x):

+ 2
- 1
utils.py View File

GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data') GDSC_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'GDSC_data')
CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_data') CCLE_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CCLE_data')
CTRP_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_data') CTRP_RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'CTRP_data')
SIM_DATA_FOLDER = os.path.join(DATA_FOLDER, 'similarity_data')


GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv') GDSC_SCREENING_DATA_FOLDER = os.path.join(GDSC_RAW_DATA_FOLDER, 'drug_screening_matrix_GDSC.tsv')
CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv') CCLE_SCREENING_DATA_FOLDER = os.path.join(CCLE_RAW_DATA_FOLDER, 'drug_screening_matrix_ccle.tsv')
SAVE_MODEL = False # Change it to True to save the trained model SAVE_MODEL = False # Change it to True to save the trained model
VARIATIONAL_AUTOENCODERS = False VARIATIONAL_AUTOENCODERS = False
# DATA_MODALITIES=['cell_CN','cell_exp','cell_methy','cell_mut','drug_comp','drug_DT'] # Change this list to only consider specific data modalities # DATA_MODALITIES=['cell_CN','cell_exp','cell_methy','cell_mut','drug_comp','drug_DT'] # Change this list to only consider specific data modalities
DATA_MODALITIES = ['cell_CN','cell_exp','cell_mut', 'drug_desc']
DATA_MODALITIES = ['cell_exp', 'drug_desc']
RANDOM_SEED = 42 # Must be used wherever can be used RANDOM_SEED = 42 # Must be used wherever can be used





Loading…
Cancel
Save