Browse Source

integrate different parts for the first run

main
MahsaYazdani 2 years ago
parent
commit
ef6cb9f74d

+ 3
- 1
.gitignore View File

@@ -5,4 +5,6 @@ drug/data/DDI/DrugBank/raw/Drugbank_drug_interactions.tsv
drug/data/DDI/SNAP Stanford/ChCh-Miner_durgbank-chem-chem.tsv
cell/data/DTI/SNAP Stanford/ChG-Miner_miner-chem-gene.tsv
cell/data/DTI/SNAP Stanford/ChG-Miner_miner-chem-gene.tsv.gz
drug/data/Smiles/drugbank_all_structure_links.csv.zip
drug/data/Smiles/drugbank_all_structure_links.csv.zip
*.pyc
predictor/test.py

BIN
drug/data/DDI/DrugBank/processed/ddi_graph_dataset.pt View File


+ 1
- 1
drug/data/DDI/DrugBank/raw/drug2id.tsv View File

@@ -5913,4 +5913,4 @@ DB12264 5910
DB06614 5911
DB09047 5912
DB11074 5913
DB00878 5914
DB00878 5914

+ 37
- 38
drug/data/drugname2drugbankid.tsv View File

@@ -1,40 +1,39 @@
drug_name drug_bank_id
5-FU DB00544
ABT-888 DB07232
AZD1775 DB11740
BEZ-235 DB11651
BORTEZOMIB DB00188
CARBOPLATIN DB00958
CYCLOPHOSPHAMIDE DB00531
DASATINIB DB01254
DEXAMETHASONE DB01234
DINACICLIB DB12021
DOXORUBICIN DB00997
ERLOTINIB DB00530
ETOPOSIDE DB00773
GELDANAMYCIN DB02424
drug_name drug_bank_id
5-FU DB00544
ABT-888 DB07232
AZD1775 DB11740
BEZ-235 DB11651
BORTEZOMIB DB00188
CARBOPLATIN DB00958
CYCLOPHOSPHAMIDE DB00531
DASATINIB DB01254
DEXAMETHASONE DB01234
DINACICLIB DB12021
DOXORUBICIN DB00997
ERLOTINIB DB00530
ETOPOSIDE DB00773
GELDANAMYCIN DB02424
GEMCITABINE DB00441
L778123 DB07227
LAPATINIB DB01259
METFORMIN DB00331
METHOTREXATE DB00563
MITOMYCINE DB00305
MK-2206 DB16828
MK-4541 DB17016
MK-4827 DB11793
MK-5108 DB12556
MK-8669 DB06233
MK-8776 DB11899
MRK-003 DB17015
OXALIPLATIN DB00526
PACLITAXEL DB01229
PD325901 DB07101
SN-38 DB05482
SORAFENIB DB00398
SUNITINIB DB01268
TEMOZOLOMIDE DB00853
TOPOTECAN DB01030
VINBLASTINE DB00570
VINORELBINE DB00361
ZOLINZA DB02546

LAPATINIB DB01259
METFORMIN DB00331
METHOTREXATE DB00563
MITOMYCINE DB00305
MK-2206 DB16828
MK-4541 DB17016
MK-4827 DB11793
MK-5108 DB12556
MK-8669 DB06233
MK-8776 DB11899
MRK-003 DB17015
OXALIPLATIN DB00526
PACLITAXEL DB01229
PD325901 DB07101
SN-38 DB05482
SORAFENIB DB00398
SUNITINIB DB01268
TEMOZOLOMIDE DB00853
TOPOTECAN DB01030
VINBLASTINE DB00570
VINORELBINE DB00361
ZOLINZA DB02546

+ 8
- 5
drug/datasets.py View File

@@ -8,7 +8,7 @@ import random


class DDInteractionDataset(Dataset):
def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
def __init__(self, root = "drug/data/", transform=None, pre_transform=None, pre_filter=None):
super(DDInteractionDataset, self).__init__(root, transform, pre_transform, pre_filter)

@property
@@ -48,8 +48,9 @@ class DDInteractionDataset(Dataset):
def generate_rand_fp(self, n_bits=256):
    """Return a random binary fingerprint as a list of 0/1 integers.

    The result always has exactly ``n_bits`` entries, most significant bit
    first. The bits are extracted by shifting rather than by formatting the
    number as an unpadded binary string: ``format(number, '0b')`` drops
    leading zeros, which yields fingerprints shorter than the requested
    width whenever the top bits of the random number happen to be zero.

    Args:
        n_bits: Width of the fingerprint. Defaults to 256.

    Returns:
        list[int]: ``n_bits`` values, each 0 or 1.
    """
    number = random.getrandbits(n_bits)
    # Walk from the highest bit down so the ordering matches the
    # zero-padded binary string representation of ``number``.
    return [(number >> shift) & 1 for shift in range(n_bits - 1, -1, -1)]
@@ -59,6 +60,7 @@ class DDInteractionDataset(Dataset):
drug_fp_df = pd.read_csv(drug_fp_path)

node_features = list()
node_ids = list()
for i in range(num_nodes):
drugbankid = self.find_drugBank_id(i)
fp = drug_fp_df.loc[drug_fp_df['DrugBank_id'] == drugbankid]
@@ -68,18 +70,19 @@ class DDInteractionDataset(Dataset):
fp = list(fp.to_numpy()[0,1:])

node_features.append(fp)
node_ids.append(drugbankid)

self.num_features = len(node_features[0])

return node_features
return node_ids, node_features

def process(self):
path = osp.join(self.raw_dir, self.raw_file_names[0])
ddi = pd.read_csv(path , sep='\t')
edge_index = torch.tensor([ddi['drug1_idx'],ddi['drug2_idx']], dtype=torch.long)
num_nodes = ddi['drug1_idx'].max() + 1
node_features = self.read_node_features(num_nodes)
# TODO: check why sometimes node_features len is less than 256
node_ids, node_features = self.read_node_features(num_nodes)
node_features = torch.tensor(node_features, dtype=torch.int)
print("node features nrow and ncol: ",len(node_features),len(node_features[0]))

# ---------------------------------------------------------------

+ 8
- 3
predictor/const.py View File

@@ -1,6 +1,10 @@
import os
# e:\Me\Master\BCB\Thesis\DrugCombModel
# PROJ_DIR = os.path.dirname(os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..')))

PROJ_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
# E:\Me\Master\BCB\Thesis\DrugCombModel\DrugCombinationPredeiction
PROJ_DIR = os.path.dirname(os.path.abspath(os.path.dirname( __file__ )))
DRUG_DIR = os.path.join(PROJ_DIR, 'drug')
SUB_PROJ_DIR = os.path.join(PROJ_DIR, 'predictor')
DATA_DIR = os.path.join(SUB_PROJ_DIR, 'data')
DRUG_DATA_DIR = os.path.join(PROJ_DIR, 'drug', 'data')
@@ -11,8 +15,9 @@ if not os.path.exists(OUTPUT_DIR):

SYNERGY_FILE = os.path.join(DATA_DIR, 'synergy.tsv')

DRUG_FEAT_FILE = os.path.join(DRUG_DATA_DIR, 'drug_feat.npy')
DRUG2ID_FILE = os.path.join(DRUG_DATA_DIR, 'drug2id.tsv')
# DRUG_FEAT_FILE = os.path.join(DRUG_DATA_DIR, 'drug_feat.npy')
# Pass each path component separately so os.path.join inserts the platform's
# separator. A single backslash-joined literal only resolves on Windows and
# relies on invalid string escapes such as "\D" and "\d".
DRUGN2ID_FILE = os.path.join(DRUG_DATA_DIR, 'DDI', 'DrugBank', 'raw', 'drug2id.tsv')
DRUGNAME_2_DRUGBANKID_FILE = os.path.join(DRUG_DATA_DIR, 'drugname2drugbankid.tsv')
CELL_FEAT_FILE = os.path.join(CELL_DATA_DIR, 'cell_feat.npy')
CELL2ID_FILE = os.path.join(CELL_DATA_DIR, 'cell2id.tsv')


+ 14
- 12
predictor/cross_validation.py View File

@@ -13,7 +13,7 @@ time_str = str(datetime.now().strftime('%y%m%d%H%M'))
from model.datasets import FastSynergyDataset, FastTensorDataLoader
from model.models import MLP
from model.utils import save_args, save_best_model, find_best_model, arg_min, random_split_indices, calc_stat, conf_inv
from const import SYNERGY_FILE, DRUG2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE, CELL2ID_FILE, OUTPUT_DIR
from const import SYNERGY_FILE, CELL_FEAT_FILE, CELL2ID_FILE, OUTPUT_DIR, DRUGNAME_2_DRUGBANKID_FILE

def eval_model(model, optimizer, loss_func, train_data, test_data,
@@ -30,15 +30,16 @@ def eval_model(model, optimizer, loss_func, train_data, test_data,


def step_batch(model, batch, loss_func, gpu_id=None, train=True):
    """Run the model on one batch and return the loss.

    Args:
        model: Callable invoked as ``model(drug1_id, drug2_id, cell_feat)``.
        batch: Sequence of four tensors
            ``(drug1_id, drug2_id, cell_feat, y_true)``.
        loss_func: Callable invoked as ``loss_func(y_pred, y_true)``.
        gpu_id: CUDA device index. When not ``None`` every tensor of the
            batch is moved to that device before the forward pass.
        train: When true, a single forward pass is used. Otherwise the
            prediction is the mean of the two drug orderings, so that the
            evaluated score does not depend on which drug is listed first.

    Returns:
        Whatever ``loss_func`` returns for the prediction and ``y_true``.
    """
    drug1_id, drug2_id, cell_feat, y_true = batch
    if gpu_id is not None:
        # The model is moved to the GPU by its creator, so the inputs must
        # live on the same device; leaving them on the CPU fails at the
        # first operation that mixes the two.
        drug1_id, drug2_id, cell_feat, y_true = (
            tensor.cuda(gpu_id)
            for tensor in (drug1_id, drug2_id, cell_feat, y_true)
        )
    if train:
        y_pred = model(drug1_id, drug2_id, cell_feat)
    else:
        forward_pred = model(drug1_id, drug2_id, cell_feat)
        swapped_pred = model(drug2_id, drug1_id, cell_feat)
        y_pred = (forward_pred + swapped_pred) / 2
    return loss_func(y_pred, y_true)
@@ -93,7 +94,8 @@ def train_model(model, optimizer, loss_func, train_loader, valid_loader, n_epoch

def create_model(data, hidden_size, gpu_id=None, drug_feat_len=256):
    """Build the MLP predictor sized for the given dataset.

    Args:
        data: Dataset exposing ``cell_feat_len()``.
        hidden_size: Hidden layer width passed to ``MLP``.
        gpu_id: CUDA device index; when not ``None`` the model is moved to
            that device.
        drug_feat_len: Per-drug feature width the MLP input is sized for.
            Defaults to 256, the value previously hard-coded here.

    Returns:
        The constructed ``MLP`` (on the GPU when ``gpu_id`` is given).
    """
    # TODO: use our own MLP model
    # The dataset no longer carries drug features, so the per-drug width
    # cannot be queried from ``data`` and is supplied by the caller instead.
    # NOTE(review): Connector builds its GCN with ``num_features // 2`` as
    # the second argument; confirm the GCN output width really matches
    # ``drug_feat_len``.
    model = MLP(data.cell_feat_len() + 2 * drug_feat_len, hidden_size)
    if gpu_id is not None:
        model = model.cuda(gpu_id)
    return model
@@ -126,9 +128,9 @@ def cv(args, out_dir):
for valid_fold in outer_trn_folds:
inner_trn_folds = [x for x in outer_trn_folds if x != valid_fold]
valid_folds = [valid_fold]
train_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE,
train_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE,
SYNERGY_FILE, use_folds=inner_trn_folds)
valid_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE,
valid_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE,
SYNERGY_FILE, use_folds=valid_folds, train=False)
train_loader = FastTensorDataLoader(*train_data.tensor_samples(), batch_size=args.batch,
shuffle=True)
@@ -150,9 +152,9 @@ def cv(args, out_dir):
time.sleep(10)
min_ls, min_idx = arg_min(losses)
best_hs, best_lr = param[min_idx]
train_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE,
train_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE,
SYNERGY_FILE, use_folds=outer_trn_folds)
test_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE,
test_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE,
SYNERGY_FILE, use_folds=[test_fold], train=False)
model = create_model(train_data, best_hs, gpu_id)
optimizer = torch.optim.Adam(model.parameters(), lr=best_lr)

+ 19
- 14
predictor/model/datasets.py View File

@@ -4,7 +4,7 @@ import torch
import random

from torch.utils.data import Dataset
from .utils import read_map
from .utils import read_map, get_index_by_name

class FastTensorDataLoader:
"""
@@ -55,11 +55,10 @@ class FastTensorDataLoader:

class FastSynergyDataset(Dataset):

def __init__(self, drug2id_file, cell2id_file, drug_feat_file, cell_feat_file, synergy_score_file, use_folds,
def __init__(self, drugname2drugbankid_file, cell2id_file, cell_feat_file, synergy_score_file, use_folds,
train=True):
self.drug2id = read_map(drug2id_file)
self.drug2id = read_map(drugname2drugbankid_file, keep_str = True)
self.cell2id = read_map(cell2id_file)
self.drug_feat = np.load(drug_feat_file)
self.cell_feat = np.load(cell_feat_file)
self.samples = []
self.raw_samples = []
@@ -72,25 +71,29 @@ class FastSynergyDataset(Dataset):
drug1, drug2, cellname, score, fold = line.rstrip().split('\t')
if drug1 in valid_drugs and drug2 in valid_drugs and cellname in valid_cells:
if int(fold) in use_folds:
drug1_id = get_index_by_name(drug1)
drug2_id = get_index_by_name(drug2)
sample = [
# TODO: specify drug_feat
torch.from_numpy(self.drug_feat[self.drug2id[drug1]]).float(),
torch.from_numpy(self.drug_feat[self.drug2id[drug2]]).float(),
# TODO: specify drug_feat
# drug1_feat + drug2_feat + cell_feat + score
torch.IntTensor([drug1_id]),
torch.IntTensor([drug2_id]),
torch.from_numpy(self.cell_feat[self.cell2id[cellname]]).float(),
torch.FloatTensor([float(score)]),
]
# print(sample)
self.samples.append(sample)
raw_sample = [self.drug2id[drug1], self.drug2id[drug2], self.cell2id[cellname], score]
raw_sample = [drug1_id, drug2_id, self.cell2id[cellname], score]
self.raw_samples.append(raw_sample)
if train:
sample = [
torch.from_numpy(self.drug_feat[self.drug2id[drug2]]).float(),
torch.from_numpy(self.drug_feat[self.drug2id[drug1]]).float(),
torch.IntTensor([drug2_id]),
torch.IntTensor([drug1_id]),
torch.from_numpy(self.cell_feat[self.cell2id[cellname]]).float(),
torch.FloatTensor([float(score)]),
]
self.samples.append(sample)
raw_sample = [self.drug2id[drug2], self.drug2id[drug1], self.cell2id[cellname], score]
raw_sample = [drug2_id, drug1_id, self.cell2id[cellname], score]
self.raw_samples.append(raw_sample)

def __len__(self):
@@ -99,15 +102,17 @@ class FastSynergyDataset(Dataset):
def __getitem__(self, item):
return self.samples[item]

def drug_feat_len(self):
return self.drug_feat.shape[-1]

def cell_feat_len(self):
return self.cell_feat.shape[-1]

def tensor_samples(self, indices=None):
if indices is None:
indices = list(range(len(self)))
# print('-----------------------------')
# print(self.samples)
# print('-----------------')
print(self.samples[0])
print(self.samples[i][0] and self.samples[i][1] for i in indices)
d1 = torch.cat([torch.unsqueeze(self.samples[i][0], 0) for i in indices], dim=0)
d2 = torch.cat([torch.unsqueeze(self.samples[i][1], 0) for i in indices], dim=0)
c = torch.cat([torch.unsqueeze(self.samples[i][2], 0) for i in indices], dim=0)

+ 27
- 6
predictor/model/models.py View File

@@ -1,17 +1,36 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import sys


PROJ_DIR = os.path.dirname(os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..')))

sys.path.insert(0, PROJ_DIR)
from drug.models import GCN
from drug.datasets import DDInteractionDataset


class Connector(nn.Module):
    """Join GCN drug embeddings with cell-line features.

    The module owns the drug-drug interaction dataset and a GCN. On every
    forward pass it runs the GCN over the dataset's node features and
    edges, looks up the two requested drugs by node index and concatenates
    their embeddings with the supplied cell-line features.
    """

    def __init__(self):
        super(Connector, self).__init__()

        # GCN over the drug-drug interaction graph
        self.ddiDataset = DDInteractionDataset()
        self.gcn = GCN(self.ddiDataset.num_features, self.ddiDataset.num_features // 2)
        # Cell line features
        # np.load('cell_feat.npy')

    def forward(self, drug1_idx, drug2_idx, cell_feat):
        """Return ``[drug1_embedding, drug2_embedding, cell_feat]`` per sample.

        Args:
            drug1_idx: Integer tensor of node indices for the first drug,
                shaped ``(N,)`` or ``(N, 1)``.
            drug2_idx: Same as ``drug1_idx`` for the second drug.
            cell_feat: 2-D tensor of cell-line features, one row per sample.

        Returns:
            2-D tensor: both drug embeddings followed by ``cell_feat``,
            concatenated along dimension 1.
        """
        # Fetch the graph once and keep it on the same device as the inputs.
        graph = self.ddiDataset.get()
        node_feats = graph.x.to(cell_feat.device)
        edge_index = graph.edge_index.to(cell_feat.device)
        embeddings = self.gcn(node_feats, edge_index)

        # Indices arrive as (N, 1) int tensors. Flatten them so the lookup
        # yields (N, F) rather than (N, 1, F), which cannot be concatenated
        # with the 2-D cell features, and cast to long for indexing.
        # NOTE(review): an index of -1 (used upstream for drugs missing from
        # the graph) silently selects the last node - confirm this is intended.
        drug1_feat = embeddings[drug1_idx.reshape(-1).long()]
        drug2_feat = embeddings[drug2_idx.reshape(-1).long()]
        return torch.cat([drug1_feat, drug2_feat, cell_feat], 1)


class MLP(nn.Module):
@@ -26,9 +45,11 @@ class MLP(nn.Module):
nn.BatchNorm1d(hidden_size // 2),
nn.Linear(hidden_size // 2, 1)
)

self.connector = Connector()
def forward(self, drug1_feat: torch.Tensor, drug2_feat: torch.Tensor, cell_feat: torch.Tensor):
feat = torch.cat([drug1_feat, drug2_feat, cell_feat], 1)
def forward(self, drug1_idx, drug2_idx, cell_feat): # prev input: self, drug1_feat: torch.Tensor, drug2_feat: torch.Tensor, cell_feat: torch.Tensor
feat = self.connector(drug1_idx, drug2_idx, cell_feat)
out = self.layers(feat)
return out


+ 14
- 5
predictor/model/utils.py View File

@@ -48,13 +48,17 @@ def save_args(args, save_to: str):
json.dump(args_dict, f, indent=2)


def read_map(map_file, keep_str=False):
    """Read a two-column mapping file into a dict.

    The first line is treated as a header and skipped. Every following
    non-blank line is split on its last run of whitespace into a key and a
    value, so both tab- and space-separated files are accepted and keys
    that themselves contain spaces are preserved.

    Args:
        map_file: Path of the mapping file.
        keep_str: When true the values are kept as strings; otherwise they
            are converted with ``int``.

    Returns:
        dict: key -> value (``str`` or ``int`` depending on ``keep_str``).

    Raises:
        ValueError: If a non-blank line has fewer than two fields, or a
            value is not an integer while ``keep_str`` is false.
    """
    mapping = {}
    with open(map_file, 'r') as f:
        next(f, None)  # skip the header row; tolerate an empty file
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Trailing or separating blank lines carry no entry.
                continue
            key, value = stripped.rsplit(None, 1)
            mapping[key] = value if keep_str else int(value)
    return mapping


@@ -80,12 +84,17 @@ def get_index_by_name(drug_name):
project_path = os.path.dirname(os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..')))
drugname2drugbankid_file = os.path.join(project_path, 'drug/data/drugname2drugbankid.tsv')
drug_name2drugbank_id_df = pd.read_csv(drugname2drugbankid_file , sep='\t')
drug_name2drugbank_id_df = pd.read_csv(drugname2drugbankid_file, sep='\s+')

drug_bank_id = drug_name2drugbank_id_df[drug_name2drugbank_id_df['drug_name'] == drug_name].drug_bank_id.item()

drug2id_file = os.path.join(project_path, 'drug/data/DDI/DrugBank/raw/', 'drug2id.tsv')
drug2id_df = pd.read_csv(drug2id_file , sep='\t')
drug_index = drug2id_df[drug2id_df['DrugBank_id'] == drug_bank_id].node_index.item()
row = drug2id_df[drug2id_df['DrugBank_id'] == drug_bank_id]
if row.empty:
drug_index = -1
else:
drug_index = row.node_index.item()

return drug_index

Loading…
Cancel
Save