@@ -5,4 +5,6 @@ drug/data/DDI/DrugBank/raw/Drugbank_drug_interactions.tsv | |||
drug/data/DDI/SNAP Stanford/ChCh-Miner_durgbank-chem-chem.tsv | |||
cell/data/DTI/SNAP Stanford/ChG-Miner_miner-chem-gene.tsv | |||
cell/data/DTI/SNAP Stanford/ChG-Miner_miner-chem-gene.tsv.gz | |||
drug/data/Smiles/drugbank_all_structure_links.csv.zip | |||
drug/data/Smiles/drugbank_all_structure_links.csv.zip | |||
*.pyc | |||
predictor/test.py |
@@ -5913,4 +5913,4 @@ DB12264 5910 | |||
DB06614 5911 | |||
DB09047 5912 | |||
DB11074 5913 | |||
DB00878 5914 | |||
DB00878 5914 |
@@ -1,40 +1,39 @@ | |||
drug_name drug_bank_id | |||
5-FU DB00544 | |||
ABT-888 DB07232 | |||
AZD1775 DB11740 | |||
BEZ-235 DB11651 | |||
BORTEZOMIB DB00188 | |||
CARBOPLATIN DB00958 | |||
CYCLOPHOSPHAMIDE DB00531 | |||
DASATINIB DB01254 | |||
DEXAMETHASONE DB01234 | |||
DINACICLIB DB12021 | |||
DOXORUBICIN DB00997 | |||
ERLOTINIB DB00530 | |||
ETOPOSIDE DB00773 | |||
GELDANAMYCIN DB02424 | |||
drug_name drug_bank_id | |||
5-FU DB00544 | |||
ABT-888 DB07232 | |||
AZD1775 DB11740 | |||
BEZ-235 DB11651 | |||
BORTEZOMIB DB00188 | |||
CARBOPLATIN DB00958 | |||
CYCLOPHOSPHAMIDE DB00531 | |||
DASATINIB DB01254 | |||
DEXAMETHASONE DB01234 | |||
DINACICLIB DB12021 | |||
DOXORUBICIN DB00997 | |||
ERLOTINIB DB00530 | |||
ETOPOSIDE DB00773 | |||
GELDANAMYCIN DB02424 | |||
GEMCITABINE DB00441 | |||
L778123 DB07227 | |||
LAPATINIB DB01259 | |||
METFORMIN DB00331 | |||
METHOTREXATE DB00563 | |||
MITOMYCINE DB00305 | |||
MK-2206 DB16828 | |||
MK-4541 DB17016 | |||
MK-4827 DB11793 | |||
MK-5108 DB12556 | |||
MK-8669 DB06233 | |||
MK-8776 DB11899 | |||
MRK-003 DB17015 | |||
OXALIPLATIN DB00526 | |||
PACLITAXEL DB01229 | |||
PD325901 DB07101 | |||
SN-38 DB05482 | |||
SORAFENIB DB00398 | |||
SUNITINIB DB01268 | |||
TEMOZOLOMIDE DB00853 | |||
TOPOTECAN DB01030 | |||
VINBLASTINE DB00570 | |||
VINORELBINE DB00361 | |||
ZOLINZA DB02546 | |||
LAPATINIB DB01259 | |||
METFORMIN DB00331 | |||
METHOTREXATE DB00563 | |||
MITOMYCINE DB00305 | |||
MK-2206 DB16828 | |||
MK-4541 DB17016 | |||
MK-4827 DB11793 | |||
MK-5108 DB12556 | |||
MK-8669 DB06233 | |||
MK-8776 DB11899 | |||
MRK-003 DB17015 | |||
OXALIPLATIN DB00526 | |||
PACLITAXEL DB01229 | |||
PD325901 DB07101 | |||
SN-38 DB05482 | |||
SORAFENIB DB00398 | |||
SUNITINIB DB01268 | |||
TEMOZOLOMIDE DB00853 | |||
TOPOTECAN DB01030 | |||
VINBLASTINE DB00570 | |||
VINORELBINE DB00361 | |||
ZOLINZA DB02546 |
@@ -8,7 +8,7 @@ import random | |||
class DDInteractionDataset(Dataset): | |||
def __init__(self, root, transform=None, pre_transform=None, pre_filter=None): | |||
def __init__(self, root = "drug/data/", transform=None, pre_transform=None, pre_filter=None): | |||
super(DDInteractionDataset, self).__init__(root, transform, pre_transform, pre_filter) | |||
@property | |||
@@ -48,8 +48,9 @@ class DDInteractionDataset(Dataset): | |||
def generate_rand_fp(self):
    """Return a random 256-bit fingerprint as a list of 0/1 integers.

    Draws 256 random bits with ``random.getrandbits`` and expands them into
    a list, most significant bit first.

    Returns:
        list[int]: exactly 256 entries, each 0 or 1.
    """
    number = random.getrandbits(256)
    # The '0256b' spec left-pads with zeros.  A plain 'b' drops leading
    # zeros, which yields fewer than 256 entries whenever the top bits of
    # the random number happen to be 0.
    return [int(bit) for bit in format(number, '0256b')]
@@ -59,6 +60,7 @@ class DDInteractionDataset(Dataset): | |||
drug_fp_df = pd.read_csv(drug_fp_path) | |||
node_features = list() | |||
node_ids = list() | |||
for i in range(num_nodes): | |||
drugbankid = self.find_drugBank_id(i) | |||
fp = drug_fp_df.loc[drug_fp_df['DrugBank_id'] == drugbankid] | |||
@@ -68,18 +70,19 @@ class DDInteractionDataset(Dataset): | |||
fp = list(fp.to_numpy()[0,1:]) | |||
node_features.append(fp) | |||
node_ids.append(drugbankid) | |||
self.num_features = len(node_features[0]) | |||
return node_features | |||
return node_ids, node_features | |||
def process(self): | |||
path = osp.join(self.raw_dir, self.raw_file_names[0]) | |||
ddi = pd.read_csv(path , sep='\t') | |||
edge_index = torch.tensor([ddi['drug1_idx'],ddi['drug2_idx']], dtype=torch.long) | |||
num_nodes = ddi['drug1_idx'].max() + 1 | |||
node_features = self.read_node_features(num_nodes) | |||
# TODO: check why sometimes node_features len is less than 256 | |||
node_ids, node_features = self.read_node_features(num_nodes) | |||
node_features = torch.tensor(node_features, dtype=torch.int) | |||
print("node features nrow and ncol: ",len(node_features),len(node_features[0])) | |||
# --------------------------------------------------------------- |
@@ -1,6 +1,10 @@ | |||
import os | |||
# e:\Me\Master\BCB\Thesis\DrugCombModel | |||
# PROJ_DIR = os.path.dirname(os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..'))) | |||
PROJ_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) | |||
# E:\Me\Master\BCB\Thesis\DrugCombModel\DrugCombinationPredeiction | |||
PROJ_DIR = os.path.dirname(os.path.abspath(os.path.dirname( __file__ ))) | |||
DRUG_DIR = os.path.join(PROJ_DIR, 'drug') | |||
SUB_PROJ_DIR = os.path.join(PROJ_DIR, 'predictor') | |||
DATA_DIR = os.path.join(SUB_PROJ_DIR, 'data') | |||
DRUG_DATA_DIR = os.path.join(PROJ_DIR, 'drug', 'data') | |||
@@ -11,8 +15,9 @@ if not os.path.exists(OUTPUT_DIR): | |||
SYNERGY_FILE = os.path.join(DATA_DIR, 'synergy.tsv') | |||
DRUG_FEAT_FILE = os.path.join(DRUG_DATA_DIR, 'drug_feat.npy') | |||
DRUG2ID_FILE = os.path.join(DRUG_DATA_DIR, 'drug2id.tsv') | |||
# DRUG_FEAT_FILE = os.path.join(DRUG_DATA_DIR, 'drug_feat.npy') | |||
# Path components are passed separately so os.path.join picks the platform's
# separator; the previous single literal hard-coded backslashes and relied on
# the invalid escape sequences '\D' and '\d' being left untouched.
DRUGN2ID_FILE = os.path.join(DRUG_DATA_DIR, 'DDI', 'DrugBank', 'raw', 'drug2id.tsv')
DRUGNAME_2_DRUGBANKID_FILE = os.path.join(DRUG_DATA_DIR, 'drugname2drugbankid.tsv') | |||
CELL_FEAT_FILE = os.path.join(CELL_DATA_DIR, 'cell_feat.npy') | |||
CELL2ID_FILE = os.path.join(CELL_DATA_DIR, 'cell2id.tsv') | |||
@@ -13,7 +13,7 @@ time_str = str(datetime.now().strftime('%y%m%d%H%M')) | |||
from model.datasets import FastSynergyDataset, FastTensorDataLoader | |||
from model.models import MLP | |||
from model.utils import save_args, save_best_model, find_best_model, arg_min, random_split_indices, calc_stat, conf_inv | |||
from const import SYNERGY_FILE, DRUG2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE, CELL2ID_FILE, OUTPUT_DIR | |||
from const import SYNERGY_FILE, CELL_FEAT_FILE, CELL2ID_FILE, OUTPUT_DIR, DRUGNAME_2_DRUGBANKID_FILE | |||
def eval_model(model, optimizer, loss_func, train_data, test_data, | |||
@@ -30,15 +30,16 @@ def eval_model(model, optimizer, loss_func, train_data, test_data, | |||
def step_batch(model, batch, loss_func, gpu_id=None, train=True):
    """Run one batch through ``model`` and return the loss.

    Args:
        model: callable taking ``(drug1_id, drug2_id, cell_feat)`` and
            returning predictions.
        batch: 4-item sequence ``(drug1_id, drug2_id, cell_feat, y_true)``.
        loss_func: callable ``loss_func(y_pred, y_true)``.
        gpu_id: CUDA device index; when not ``None`` every item of the batch
            is moved to that device before the forward pass.
        train: when true, a single forward pass is made.  Otherwise the
            prediction is the mean of the two drug orderings.

    Returns:
        Whatever ``loss_func`` returns for the batch.
    """
    drug1_id, drug2_id, cell_feat, y_true = batch
    if gpu_id is not None:
        # create_model() moves the model to this device, so the inputs and
        # the target must follow it or the forward pass / loss will fail
        # with a device mismatch.
        drug1_id, drug2_id, cell_feat, y_true = (
            t.cuda(gpu_id) for t in (drug1_id, drug2_id, cell_feat, y_true)
        )
    if train:
        y_pred = model(drug1_id, drug2_id, cell_feat)
    else:
        # Average over both drug orderings at evaluation time.
        yp1 = model(drug1_id, drug2_id, cell_feat)
        yp2 = model(drug2_id, drug1_id, cell_feat)
        y_pred = (yp1 + yp2) / 2
    loss = loss_func(y_pred, y_true)
    return loss
@@ -93,7 +94,8 @@ def train_model(model, optimizer, loss_func, train_loader, valid_loader, n_epoch | |||
def create_model(data, hidden_size, gpu_id=None, drug_feat_len=256):
    """Build the MLP predictor sized for ``data``.

    The MLP input width is ``data.cell_feat_len() + 2 * drug_feat_len``:
    one cell-line feature vector plus one feature vector per drug.

    Args:
        data: dataset exposing ``cell_feat_len()``.
        hidden_size: hidden width forwarded to ``MLP``.
        gpu_id: CUDA device index; when not ``None`` the model is moved to
            that device.
        drug_feat_len: per-drug feature width.  Defaults to 256, the value
            that was previously hard-coded here.
            NOTE(review): this must match the width the model actually
            produces per drug -- confirm against the GCN output size.

    Returns:
        The constructed (and possibly CUDA-resident) ``MLP``.
    """
    # TODO: use our own MLP model
    model = MLP(data.cell_feat_len() + 2 * drug_feat_len, hidden_size)
    if gpu_id is not None:
        model = model.cuda(gpu_id)
    return model
@@ -126,9 +128,9 @@ def cv(args, out_dir): | |||
for valid_fold in outer_trn_folds: | |||
inner_trn_folds = [x for x in outer_trn_folds if x != valid_fold] | |||
valid_folds = [valid_fold] | |||
train_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE, | |||
train_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE, | |||
SYNERGY_FILE, use_folds=inner_trn_folds) | |||
valid_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE, | |||
valid_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE, | |||
SYNERGY_FILE, use_folds=valid_folds, train=False) | |||
train_loader = FastTensorDataLoader(*train_data.tensor_samples(), batch_size=args.batch, | |||
shuffle=True) | |||
@@ -150,9 +152,9 @@ def cv(args, out_dir): | |||
time.sleep(10) | |||
min_ls, min_idx = arg_min(losses) | |||
best_hs, best_lr = param[min_idx] | |||
train_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE, | |||
train_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE, | |||
SYNERGY_FILE, use_folds=outer_trn_folds) | |||
test_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE, | |||
test_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE, | |||
SYNERGY_FILE, use_folds=[test_fold], train=False) | |||
model = create_model(train_data, best_hs, gpu_id) | |||
optimizer = torch.optim.Adam(model.parameters(), lr=best_lr) |
@@ -4,7 +4,7 @@ import torch | |||
import random | |||
from torch.utils.data import Dataset | |||
from .utils import read_map | |||
from .utils import read_map, get_index_by_name | |||
class FastTensorDataLoader: | |||
""" | |||
@@ -55,11 +55,10 @@ class FastTensorDataLoader: | |||
class FastSynergyDataset(Dataset): | |||
def __init__(self, drug2id_file, cell2id_file, drug_feat_file, cell_feat_file, synergy_score_file, use_folds, | |||
def __init__(self, drugname2drugbankid_file, cell2id_file, cell_feat_file, synergy_score_file, use_folds, | |||
train=True): | |||
self.drug2id = read_map(drug2id_file) | |||
self.drug2id = read_map(drugname2drugbankid_file, keep_str = True) | |||
self.cell2id = read_map(cell2id_file) | |||
self.drug_feat = np.load(drug_feat_file) | |||
self.cell_feat = np.load(cell_feat_file) | |||
self.samples = [] | |||
self.raw_samples = [] | |||
@@ -72,25 +71,29 @@ class FastSynergyDataset(Dataset): | |||
drug1, drug2, cellname, score, fold = line.rstrip().split('\t') | |||
if drug1 in valid_drugs and drug2 in valid_drugs and cellname in valid_cells: | |||
if int(fold) in use_folds: | |||
drug1_id = get_index_by_name(drug1) | |||
drug2_id = get_index_by_name(drug2) | |||
sample = [ | |||
# TODO: specify drug_feat | |||
torch.from_numpy(self.drug_feat[self.drug2id[drug1]]).float(), | |||
torch.from_numpy(self.drug_feat[self.drug2id[drug2]]).float(), | |||
# TODO: specify drug_feat | |||
# drug1_feat + drug2_feat + cell_feat + score | |||
torch.IntTensor([drug1_id]), | |||
torch.IntTensor([drug2_id]), | |||
torch.from_numpy(self.cell_feat[self.cell2id[cellname]]).float(), | |||
torch.FloatTensor([float(score)]), | |||
] | |||
# print(sample) | |||
self.samples.append(sample) | |||
raw_sample = [self.drug2id[drug1], self.drug2id[drug2], self.cell2id[cellname], score] | |||
raw_sample = [drug1_id, drug2_id, self.cell2id[cellname], score] | |||
self.raw_samples.append(raw_sample) | |||
if train: | |||
sample = [ | |||
torch.from_numpy(self.drug_feat[self.drug2id[drug2]]).float(), | |||
torch.from_numpy(self.drug_feat[self.drug2id[drug1]]).float(), | |||
torch.IntTensor([drug2_id]), | |||
torch.IntTensor([drug1_id]), | |||
torch.from_numpy(self.cell_feat[self.cell2id[cellname]]).float(), | |||
torch.FloatTensor([float(score)]), | |||
] | |||
self.samples.append(sample) | |||
raw_sample = [self.drug2id[drug2], self.drug2id[drug1], self.cell2id[cellname], score] | |||
raw_sample = [drug2_id, drug1_id, self.cell2id[cellname], score] | |||
self.raw_samples.append(raw_sample) | |||
def __len__(self): | |||
@@ -99,15 +102,17 @@ class FastSynergyDataset(Dataset): | |||
def __getitem__(self, item): | |||
return self.samples[item] | |||
def drug_feat_len(self): | |||
return self.drug_feat.shape[-1] | |||
def cell_feat_len(self): | |||
return self.cell_feat.shape[-1] | |||
def tensor_samples(self, indices=None): | |||
if indices is None: | |||
indices = list(range(len(self))) | |||
# print('-----------------------------') | |||
# print(self.samples) | |||
# print('-----------------') | |||
print(self.samples[0]) | |||
print(self.samples[i][0] and self.samples[i][1] for i in indices) | |||
d1 = torch.cat([torch.unsqueeze(self.samples[i][0], 0) for i in indices], dim=0) | |||
d2 = torch.cat([torch.unsqueeze(self.samples[i][1], 0) for i in indices], dim=0) | |||
c = torch.cat([torch.unsqueeze(self.samples[i][2], 0) for i in indices], dim=0) |
@@ -1,17 +1,36 @@ | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import os | |||
import sys | |||
PROJ_DIR = os.path.dirname(os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..'))) | |||
sys.path.insert(0, PROJ_DIR) | |||
from drug.models import GCN | |||
from drug.datasets import DDInteractionDataset | |||
class Connector(nn.Module):
    """Join GCN drug embeddings with cell-line features.

    Owns a ``DDInteractionDataset`` and a ``GCN`` built from the dataset's
    ``num_features``.  ``forward`` runs the GCN over the dataset graph, picks
    the rows of the two requested drugs and concatenates them with the
    cell-line features along dimension 1.
    """

    def __init__(self):
        super(Connector, self).__init__()
        # GCN over the drug-drug interaction graph.
        self.ddiDataset = DDInteractionDataset()
        self.gcn = GCN(self.ddiDataset.num_features, self.ddiDataset.num_features // 2)

        # Cell line features
        # np.load('cell_feat.npy')

    def forward(self, drug1_idx, drug2_idx, cell_feat):
        """Return ``[drug1_embedding, drug2_embedding, cell_feat]`` per sample.

        Args:
            drug1_idx: tensor of node indices for the first drug.
            drug2_idx: tensor of node indices for the second drug.
            cell_feat: 2-D tensor of cell-line features, one row per sample.

        Returns:
            2-D tensor, the three parts concatenated along dimension 1.
        """
        # Fetch the graph once instead of once per attribute.
        graph = self.ddiDataset.get()
        x = self.gcn(graph.x, graph.edge_index)

        # Callers appear to pass (batch, 1) integer index tensors; indexing
        # with those would give a 3-D result that cannot be concatenated with
        # the 2-D cell features.  Flatten to (batch,) and cast to long so the
        # lookup yields one embedding row per sample.
        # NOTE(review): a negative index (e.g. a -1 "not found" sentinel)
        # would silently select a row from the end -- verify callers filter it.
        drug1_feat = x[drug1_idx.reshape(-1).long()]
        drug2_feat = x[drug2_idx.reshape(-1).long()]
        feat = torch.cat([drug1_feat, drug2_feat, cell_feat], 1)
        return feat
class MLP(nn.Module): | |||
@@ -26,9 +45,11 @@ class MLP(nn.Module): | |||
nn.BatchNorm1d(hidden_size // 2), | |||
nn.Linear(hidden_size // 2, 1) | |||
) | |||
self.connector = Connector() | |||
def forward(self, drug1_feat: torch.Tensor, drug2_feat: torch.Tensor, cell_feat: torch.Tensor): | |||
feat = torch.cat([drug1_feat, drug2_feat, cell_feat], 1) | |||
def forward(self, drug1_idx, drug2_idx, cell_feat): # prev input: self, drug1_feat: torch.Tensor, drug2_feat: torch.Tensor, cell_feat: torch.Tensor | |||
feat = self.connector(drug1_idx, drug2_idx, cell_feat) | |||
out = self.layers(feat) | |||
return out | |||
@@ -48,13 +48,17 @@ def save_args(args, save_to: str): | |||
json.dump(args_dict, f, indent=2) | |||
def read_map(map_file, keep_str=False):
    """Read a two-column, whitespace-separated mapping file into a dict.

    The first line is treated as a header and skipped.  Every following
    non-blank line is split at its last run of whitespace: everything before
    it is the key, the final token is the value.

    Args:
        map_file: path of the file to read.
        keep_str: when true, values are kept as strings; otherwise they are
            converted with ``int``.

    Returns:
        dict mapping each key to its value.

    Raises:
        ValueError: if a line has fewer than two tokens, or a value is not
            an integer while ``keep_str`` is false.
    """
    mapping = {}
    with open(map_file, 'r') as f:
        f.readline()  # header row
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # Split from the right so keys that contain whitespace survive.
            k, v = line.rsplit(None, 1)
            mapping[k] = v if keep_str else int(v)
    return mapping
@@ -80,12 +84,17 @@ def get_index_by_name(drug_name): | |||
project_path = os.path.dirname(os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..'))) | |||
drugname2drugbankid_file = os.path.join(project_path, 'drug/data/drugname2drugbankid.tsv') | |||
drug_name2drugbank_id_df = pd.read_csv(drugname2drugbankid_file , sep='\t') | |||
drug_name2drugbank_id_df = pd.read_csv(drugname2drugbankid_file, sep='\s+') | |||
drug_bank_id = drug_name2drugbank_id_df[drug_name2drugbank_id_df['drug_name'] == drug_name].drug_bank_id.item() | |||
drug2id_file = os.path.join(project_path, 'drug/data/DDI/DrugBank/raw/', 'drug2id.tsv') | |||
drug2id_df = pd.read_csv(drug2id_file , sep='\t') | |||
drug_index = drug2id_df[drug2id_df['DrugBank_id'] == drug_bank_id].node_index.item() | |||
row = drug2id_df[drug2id_df['DrugBank_id'] == drug_bank_id] | |||
if row.empty: | |||
drug_index = -1 | |||
else: | |||
drug_index = row.node_index.item() | |||
return drug_index |