|
|
|
|
|
|
|
|
# main_random.py
|
|
|
|
|
|
import argparse
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
import scipy.sparse as sp
|
|
|
|
|
|
from sklearn.model_selection import KFold
|
|
|
|
|
|
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
|
|
|
|
|
|
from model import DeepTraCDR, Optimizer
|
|
|
|
|
|
from utils import evaluate_auc, common_data_index
|
|
|
|
|
|
from data_sampler import TargetSampler
|
|
|
|
|
|
from data_loader import load_data
|
|
|
|
|
|
import torch
|
|
|
|
|
|
from torch.optim.lr_scheduler import OneCycleLR
|
|
|
|
|
|
|
|
|
|
|
|
# Clear CUDA cache to optimize GPU memory usage
|
|
|
|
|
|
# NOTE(review): this runs at import time, before this script has allocated any
# CUDA tensors, so there is presumably nothing cached to release yet — confirm
# whether the call is still needed here.
torch.cuda.empty_cache()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(argv=None):
    """
    Run the DeepTraCDR training and evaluation pipeline for the target drugs.

    Parses command-line arguments, reads the cell-drug binary matrix of the
    selected dataset, loads the model inputs through ``load_data``, runs
    repeated shuffled k-fold cross-validation over the non-zero entries of the
    target-drug columns, and prints per-fold and aggregate AUC / AUPRC.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the original
            command-line behaviour.

    Raises:
        SystemExit: Raised by ``argparse`` (exit status 2) when an argument is
            invalid, including a ``-data`` value other than ``gdsc``/``ccle``.
    """
    # Target drug CIDs and cell-drug matrix location for each supported dataset.
    datasets = {
        "gdsc": (
            [5330286, 11338033, 24825971],
            "/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_drug_binary.csv",
        ),
        "ccle": (
            [5330286],
            "/media/external_16TB_1/ali_kianfar/Data/CCLE/cell_drug_binary.csv",
        ),
    }

    # Initialize argument parser for command-line arguments
    parser = argparse.ArgumentParser(description="DeepTraCDR Advanced Model Training")
    parser.add_argument('-device', type=str,
                        default="cuda:0" if torch.cuda.is_available() else "cpu",
                        help="Device to run the model on (cuda:0 or cpu)")
    # Restricting the choices rejects an unsupported dataset up front; without
    # it the target-drug variables below would never be assigned and the run
    # would only fail later, after the data had already been loaded.
    parser.add_argument('-data', type=str, default='ccle', choices=tuple(datasets),
                        help="Dataset to use (gdsc or ccle)")
    parser.add_argument('--wd', type=float, default=1e-4, help="Weight decay for optimizer")
    parser.add_argument('--layer_size', nargs='+', type=int, default=[512],
                        help="List of layer sizes for the GCN model")
    parser.add_argument('--gamma', type=float, default=15, help="Gamma parameter for model")
    parser.add_argument('--epochs', type=int, default=1000, help="Number of training epochs")
    parser.add_argument('--test_freq', type=int, default=50,
                        help="Frequency of evaluation during training")
    parser.add_argument('--lr', type=float, default=0.0005, help="Learning rate for optimizer")
    args = parser.parse_args(argv)

    # Load the cell-drug binary matrix and derive the target-drug information
    cid_list, csv_path = datasets[args.data]
    target_drug_cids = np.array(cid_list)
    cell_drug = pd.read_csv(csv_path, index_col=0, header=0)
    cell_drug.columns = cell_drug.columns.astype(np.int32)
    drug_cids = cell_drug.columns.values
    cell_target_drug = np.array(cell_drug.loc[:, target_drug_cids], dtype=np.float32)
    # Number of entries stored by the sparse conversion of the target-drug
    # columns; this is the population that the k-fold split partitions.
    target_pos_num = sp.coo_matrix(cell_target_drug).data.shape[0]
    # presumably the positions of the target CIDs within drug_cids — verify
    # against utils.common_data_index
    target_indexes = common_data_index(drug_cids, target_drug_cids)

    # Load dataset components including adjacency matrix, fingerprints, and expression data.
    # load_data also returns args, which replaces the parsed namespace from here on.
    full_adj, drug_fingerprints, exprs, null_mask, _pos_num, args = load_data(args)
    # Keep a separate copy for the sampler; full_adj itself may be converted to a tensor below.
    full_adj_np = full_adj.copy()

    # Log original adjacency matrix shape for debugging
    print(f"Original adj_mat shape: {full_adj.shape}")

    # Log shapes of loaded data for verification
    print("\n--- Data Shapes ---")
    print(f"Expression data shape: {exprs.shape}")
    print(f"Null mask shape: {null_mask.shape}")

    # Convert adjacency matrix to PyTorch tensor if it is a NumPy array
    if isinstance(full_adj, np.ndarray):
        full_adj = torch.from_numpy(full_adj).float().to(args.device)

    # Log converted adjacency matrix shape for verification
    print(f"Converted adj_mat shape: {full_adj.shape}")

    # Cross-validation setup: n_kfolds repetitions of a k-way split
    k = 5
    n_kfolds = 5
    all_metrics = {
        'auc': [],
        'auprc': [],
    }

    for n_kfold in range(n_kfolds):
        # A different random_state per repetition gives a different shuffle each time
        kfold = KFold(n_splits=k, shuffle=True, random_state=n_kfold)
        for fold, (train_idx, test_idx) in enumerate(kfold.split(np.arange(target_pos_num))):
            # Initialize data sampler for train/test split
            sampler = TargetSampler(
                response_mat=full_adj_np,
                null_mask=null_mask,
                target_indexes=target_indexes,
                pos_train_index=train_idx,
                pos_test_index=test_idx
            )

            # Initialize DeepTraCDR model
            # NOTE(review): the main_target.py variant passes the fingerprints as
            # `drug_fingerprints=` and uses `ModelOptimizer` — confirm which
            # names model.py actually defines.
            model = DeepTraCDR(
                adj_mat=full_adj,
                cell_exprs=exprs,
                drug_finger=drug_fingerprints,
                layer_size=args.layer_size,
                gamma=args.gamma,
                device=args.device
            )

            # Initialize optimizer for model training
            opt = Optimizer(
                model=model,
                train_data=sampler.train_data,
                test_data=sampler.test_data,
                test_mask=sampler.test_mask,
                train_mask=sampler.train_mask,
                adj_matrix=full_adj,
                evaluate_fun=evaluate_auc,
                lr=args.lr,
                wd=args.wd,
                epochs=args.epochs,
                test_freq=args.test_freq,
                device=args.device
            )

            # Train model; only the two summary metrics are used here
            _true, _pred, best_auc, best_auprc = opt.train()

            # Store metrics for this fold
            all_metrics['auc'].append(best_auc)
            all_metrics['auprc'].append(best_auprc)

            # Log performance for the current fold (numbered across all repetitions)
            print(f"Fold {n_kfold * k + fold + 1}: AUC={best_auc:.4f}, AUPRC={best_auprc:.4f}")

    # Calculate and log mean and standard deviation of metrics
    print("\nFinal Average Metrics:")
    for metric, values in all_metrics.items():
        mean = np.mean(values)
        std = np.std(values)
        print(f"{metric.upper()}: {mean:.4f} ± {std:.4f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # 'high' lets PyTorch use faster, reduced-precision internal formats for
    # float32 matrix multiplications.
    torch.set_float32_matmul_precision('high')
    # Run the pipeline; without this call the script would configure PyTorch
    # and then exit without doing any work.
    main()
|
|
|
|
|
|
|
|
|
# main_target.py
|
|
|
|
|
import argparse |
|
|
|
|
|
import numpy as np |
|
|
|
|
|
import pandas as pd |
|
|
|
|
|
import scipy.sparse as sp |
|
|
|
|
|
from sklearn.model_selection import KFold |
|
|
|
|
|
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, f1_score |
|
|
|
|
|
from model import DeepTraCDR, ModelOptimizer |
|
|
|
|
|
from utils import evaluate_auc, common_data_index |
|
|
|
|
|
from data_sampler import TargetSampler |
|
|
|
|
|
from data_loader import load_data |
|
|
|
|
|
import torch |
|
|
|
|
|
from torch.optim.lr_scheduler import OneCycleLR |
|
|
|
|
|
|
|
|
|
|
|
# Release unoccupied cached GPU memory held by PyTorch's CUDA allocator
torch.cuda.empty_cache()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(argv=None):
    """
    Run the DeepTraCDR training and evaluation pipeline for the target drugs.

    Parses command-line arguments, reads the cell-drug binary matrix of the
    selected dataset, loads the model inputs through ``load_data``, runs
    repeated shuffled k-fold cross-validation over the non-zero entries of the
    target-drug columns, and prints per-fold and aggregate AUC / AUPRC.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the original
            command-line behaviour.

    Raises:
        SystemExit: Raised by ``argparse`` (exit status 2) when an argument is
            invalid, including a ``-data`` value other than ``gdsc``/``ccle``.
    """
    # Target drug CIDs and cell-drug matrix location for each supported dataset.
    datasets = {
        "gdsc": (
            [5330286, 11338033, 24825971],
            "/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_drug_binary.csv",
        ),
        "ccle": (
            [5330286],
            "/media/external_16TB_1/ali_kianfar/Data/CCLE/cell_drug_binary.csv",
        ),
    }

    # Initialize argument parser for command-line arguments
    parser = argparse.ArgumentParser(description="DeepTraCDR Advanced Model Training")
    parser.add_argument('-device', type=str,
                        default="cuda:0" if torch.cuda.is_available() else "cpu",
                        help="Device to run the model on (cuda:0 or cpu)")
    # Restricting the choices rejects an unsupported dataset up front; without
    # it the target-drug variables below would never be assigned and the run
    # would only fail later, after the data had already been loaded.
    parser.add_argument('-data', type=str, default='gdsc', choices=tuple(datasets),
                        help="Dataset to use (gdsc or ccle)")
    parser.add_argument('--wd', type=float, default=1e-4, help="Weight decay for optimizer")
    parser.add_argument('--layer_size', nargs='+', type=int, default=[512],
                        help="List of layer sizes for the GCN model")
    parser.add_argument('--gamma', type=float, default=15, help="Gamma parameter for model")
    parser.add_argument('--epochs', type=int, default=1000, help="Number of training epochs")
    parser.add_argument('--test_freq', type=int, default=50,
                        help="Frequency of evaluation during training")
    parser.add_argument('--lr', type=float, default=0.0005, help="Learning rate for optimizer")
    args = parser.parse_args(argv)

    # Load the cell-drug binary matrix and derive the target-drug information
    cid_list, csv_path = datasets[args.data]
    target_drug_cids = np.array(cid_list)
    cell_drug = pd.read_csv(csv_path, index_col=0, header=0)
    cell_drug.columns = cell_drug.columns.astype(np.int32)
    drug_cids = cell_drug.columns.values
    cell_target_drug = np.array(cell_drug.loc[:, target_drug_cids], dtype=np.float32)
    # Number of entries stored by the sparse conversion of the target-drug
    # columns; this is the population that the k-fold split partitions.
    target_pos_num = sp.coo_matrix(cell_target_drug).data.shape[0]
    # presumably the positions of the target CIDs within drug_cids — verify
    # against utils.common_data_index
    target_indexes = common_data_index(drug_cids, target_drug_cids)

    # Load dataset components including adjacency matrix, fingerprints, and expression data.
    # load_data also returns args, which replaces the parsed namespace from here on.
    full_adj, drug_fingerprints, exprs, null_mask, _pos_num, args = load_data(args)
    # Keep a separate copy for the sampler; full_adj itself may be converted to a tensor below.
    full_adj_np = full_adj.copy()

    # Log original adjacency matrix shape for debugging
    print(f"Original adj_mat shape: {full_adj.shape}")

    # Log shapes of loaded data for verification
    print("\n--- Data Shapes ---")
    print(f"Expression data shape: {exprs.shape}")
    print(f"Null mask shape: {null_mask.shape}")

    # Convert adjacency matrix to PyTorch tensor if it is a NumPy array
    if isinstance(full_adj, np.ndarray):
        full_adj = torch.from_numpy(full_adj).float().to(args.device)

    # Log converted adjacency matrix shape for verification
    print(f"Converted adj_mat shape: {full_adj.shape}")

    # Cross-validation setup: n_kfolds repetitions of a k-way split
    k = 5
    n_kfolds = 5
    all_metrics = {
        'auc': [],
        'auprc': [],
    }

    for n_kfold in range(n_kfolds):
        # A different random_state per repetition gives a different shuffle each time
        kfold = KFold(n_splits=k, shuffle=True, random_state=n_kfold)
        for fold, (train_idx, test_idx) in enumerate(kfold.split(np.arange(target_pos_num))):
            # Initialize data sampler for train/test split
            sampler = TargetSampler(response_mat=full_adj_np, null_mask=null_mask,
                                    target_indexes=target_indexes,
                                    pos_train_index=train_idx, pos_test_index=test_idx)

            # Initialize DeepTraCDR model
            # NOTE(review): the main_random.py variant passes the fingerprints as
            # `drug_finger=` and uses `Optimizer` — confirm which names
            # model.py actually defines.
            model = DeepTraCDR(
                adj_mat=full_adj,
                cell_exprs=exprs,
                drug_fingerprints=drug_fingerprints,
                layer_size=args.layer_size,
                gamma=args.gamma,
                device=args.device
            )

            # Initialize optimizer for model training
            opt = ModelOptimizer(
                model=model,
                train_data=sampler.train_data,
                test_data=sampler.test_data,
                test_mask=sampler.test_mask,
                train_mask=sampler.train_mask,
                adj_matrix=full_adj,
                evaluate_fun=evaluate_auc,
                lr=args.lr,
                wd=args.wd,
                epochs=args.epochs,
                test_freq=args.test_freq,
                device=args.device
            )

            # Train model; only the two summary metrics are used here
            _true, _pred, best_auc, best_auprc = opt.train()

            # Store metrics for this fold
            all_metrics['auc'].append(best_auc)
            all_metrics['auprc'].append(best_auprc)

            # Log performance for the current fold (numbered across all repetitions)
            print(f"Fold {n_kfold * k + fold + 1}: AUC={best_auc:.4f}, AUPRC={best_auprc:.4f}")

    # Calculate and log mean and standard deviation of metrics
    print("\nFinal Average Metrics:")
    for metric, values in all_metrics.items():
        mean = np.mean(values)
        std = np.std(values)
        print(f"{metric.upper()}: {mean:.4f} ± {std:.4f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # 'high' lets PyTorch use faster, reduced-precision internal formats for
    # float32 matrix multiplications.
    torch.set_float32_matmul_precision('high')
    # Run the pipeline exactly once; a second call would repeat the entire
    # cross-validation.
    main()