Browse Source

integrate different parts for the first run

main
MahsaYazdani 2 years ago
parent
commit
ef6cb9f74d

+ 3
- 1
.gitignore View File

drug/data/DDI/SNAP Stanford/ChCh-Miner_durgbank-chem-chem.tsv drug/data/DDI/SNAP Stanford/ChCh-Miner_durgbank-chem-chem.tsv
cell/data/DTI/SNAP Stanford/ChG-Miner_miner-chem-gene.tsv cell/data/DTI/SNAP Stanford/ChG-Miner_miner-chem-gene.tsv
cell/data/DTI/SNAP Stanford/ChG-Miner_miner-chem-gene.tsv.gz cell/data/DTI/SNAP Stanford/ChG-Miner_miner-chem-gene.tsv.gz
drug/data/Smiles/drugbank_all_structure_links.csv.zip
drug/data/Smiles/drugbank_all_structure_links.csv.zip
*.pyc
predictor/test.py

BIN
drug/data/DDI/DrugBank/processed/ddi_graph_dataset.pt View File


+ 1
- 1
drug/data/DDI/DrugBank/raw/drug2id.tsv View File

DB06614 5911 DB06614 5911
DB09047 5912 DB09047 5912
DB11074 5913 DB11074 5913
DB00878 5914
DB00878 5914

+ 37
- 38
drug/data/drugname2drugbankid.tsv View File

drug_name drug_bank_id
5-FU DB00544
ABT-888 DB07232
AZD1775 DB11740
BEZ-235 DB11651
BORTEZOMIB DB00188
CARBOPLATIN DB00958
CYCLOPHOSPHAMIDE DB00531
DASATINIB DB01254
DEXAMETHASONE DB01234
DINACICLIB DB12021
DOXORUBICIN DB00997
ERLOTINIB DB00530
ETOPOSIDE DB00773
GELDANAMYCIN DB02424
drug_name drug_bank_id
5-FU DB00544
ABT-888 DB07232
AZD1775 DB11740
BEZ-235 DB11651
BORTEZOMIB DB00188
CARBOPLATIN DB00958
CYCLOPHOSPHAMIDE DB00531
DASATINIB DB01254
DEXAMETHASONE DB01234
DINACICLIB DB12021
DOXORUBICIN DB00997
ERLOTINIB DB00530
ETOPOSIDE DB00773
GELDANAMYCIN DB02424
GEMCITABINE DB00441 GEMCITABINE DB00441
L778123 DB07227 L778123 DB07227
LAPATINIB DB01259
METFORMIN DB00331
METHOTREXATE DB00563
MITOMYCINE DB00305
MK-2206 DB16828
MK-4541 DB17016
MK-4827 DB11793
MK-5108 DB12556
MK-8669 DB06233
MK-8776 DB11899
MRK-003 DB17015
OXALIPLATIN DB00526
PACLITAXEL DB01229
PD325901 DB07101
SN-38 DB05482
SORAFENIB DB00398
SUNITINIB DB01268
TEMOZOLOMIDE DB00853
TOPOTECAN DB01030
VINBLASTINE DB00570
VINORELBINE DB00361
ZOLINZA DB02546

LAPATINIB DB01259
METFORMIN DB00331
METHOTREXATE DB00563
MITOMYCINE DB00305
MK-2206 DB16828
MK-4541 DB17016
MK-4827 DB11793
MK-5108 DB12556
MK-8669 DB06233
MK-8776 DB11899
MRK-003 DB17015
OXALIPLATIN DB00526
PACLITAXEL DB01229
PD325901 DB07101
SN-38 DB05482
SORAFENIB DB00398
SUNITINIB DB01268
TEMOZOLOMIDE DB00853
TOPOTECAN DB01030
VINBLASTINE DB00570
VINORELBINE DB00361
ZOLINZA DB02546

+ 8
- 5
drug/datasets.py View File





class DDInteractionDataset(Dataset): class DDInteractionDataset(Dataset):
def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
def __init__(self, root = "drug/data/", transform=None, pre_transform=None, pre_filter=None):
super(DDInteractionDataset, self).__init__(root, transform, pre_transform, pre_filter) super(DDInteractionDataset, self).__init__(root, transform, pre_transform, pre_filter)


@property @property
def generate_rand_fp(self, n_bits=256):
    """Return a random binary fingerprint as a list of ``n_bits`` ints (0/1).

    The random number is rendered as a zero-padded binary string so the
    result always has exactly ``n_bits`` entries; an unpadded rendering
    silently drops leading zeros and yields shorter fingerprints.
    """
    number = random.getrandbits(n_bits)
    # Zero-pad to a fixed width so every fingerprint has the same length.
    binary_string = format(number, '0{}b'.format(n_bits))
    return [int(bit) for bit in binary_string]
drug_fp_df = pd.read_csv(drug_fp_path) drug_fp_df = pd.read_csv(drug_fp_path)


node_features = list() node_features = list()
node_ids = list()
for i in range(num_nodes): for i in range(num_nodes):
drugbankid = self.find_drugBank_id(i) drugbankid = self.find_drugBank_id(i)
fp = drug_fp_df.loc[drug_fp_df['DrugBank_id'] == drugbankid] fp = drug_fp_df.loc[drug_fp_df['DrugBank_id'] == drugbankid]
fp = list(fp.to_numpy()[0,1:]) fp = list(fp.to_numpy()[0,1:])


node_features.append(fp) node_features.append(fp)
node_ids.append(drugbankid)


self.num_features = len(node_features[0]) self.num_features = len(node_features[0])


return node_features
return node_ids, node_features


def process(self): def process(self):
path = osp.join(self.raw_dir, self.raw_file_names[0]) path = osp.join(self.raw_dir, self.raw_file_names[0])
ddi = pd.read_csv(path , sep='\t') ddi = pd.read_csv(path , sep='\t')
edge_index = torch.tensor([ddi['drug1_idx'],ddi['drug2_idx']], dtype=torch.long) edge_index = torch.tensor([ddi['drug1_idx'],ddi['drug2_idx']], dtype=torch.long)
num_nodes = ddi['drug1_idx'].max() + 1 num_nodes = ddi['drug1_idx'].max() + 1
node_features = self.read_node_features(num_nodes)
# TODO: check why sometimes node_features len is less than 256
node_ids, node_features = self.read_node_features(num_nodes)
node_features = torch.tensor(node_features, dtype=torch.int)
print("node features nrow and ncol: ",len(node_features),len(node_features[0])) print("node features nrow and ncol: ",len(node_features),len(node_features[0]))


# --------------------------------------------------------------- # ---------------------------------------------------------------

+ 8
- 3
predictor/const.py View File

import os import os
# e:\Me\Master\BCB\Thesis\DrugCombModel
# PROJ_DIR = os.path.dirname(os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..')))


PROJ_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
# E:\Me\Master\BCB\Thesis\DrugCombModel\DrugCombinationPredeiction
PROJ_DIR = os.path.dirname(os.path.abspath(os.path.dirname( __file__ )))
DRUG_DIR = os.path.join(PROJ_DIR, 'drug')
SUB_PROJ_DIR = os.path.join(PROJ_DIR, 'predictor') SUB_PROJ_DIR = os.path.join(PROJ_DIR, 'predictor')
DATA_DIR = os.path.join(SUB_PROJ_DIR, 'data') DATA_DIR = os.path.join(SUB_PROJ_DIR, 'data')
DRUG_DATA_DIR = os.path.join(PROJ_DIR, 'drug', 'data') DRUG_DATA_DIR = os.path.join(PROJ_DIR, 'drug', 'data')


SYNERGY_FILE = os.path.join(DATA_DIR, 'synergy.tsv') SYNERGY_FILE = os.path.join(DATA_DIR, 'synergy.tsv')


DRUG_FEAT_FILE = os.path.join(DRUG_DATA_DIR, 'drug_feat.npy')
DRUG2ID_FILE = os.path.join(DRUG_DATA_DIR, 'drug2id.tsv')
# DRUG_FEAT_FILE = os.path.join(DRUG_DATA_DIR, 'drug_feat.npy')
# Path components are passed separately so the separator is chosen by
# os.path.join; a backslash-joined literal only resolves on Windows and
# contains invalid escape sequences (\D, \d).
DRUGN2ID_FILE = os.path.join(DRUG_DATA_DIR, 'DDI', 'DrugBank', 'raw', 'drug2id.tsv')
DRUGNAME_2_DRUGBANKID_FILE = os.path.join(DRUG_DATA_DIR, 'drugname2drugbankid.tsv')
CELL_FEAT_FILE = os.path.join(CELL_DATA_DIR, 'cell_feat.npy') CELL_FEAT_FILE = os.path.join(CELL_DATA_DIR, 'cell_feat.npy')
CELL2ID_FILE = os.path.join(CELL_DATA_DIR, 'cell2id.tsv') CELL2ID_FILE = os.path.join(CELL_DATA_DIR, 'cell2id.tsv')



+ 14
- 12
predictor/cross_validation.py View File

from model.datasets import FastSynergyDataset, FastTensorDataLoader from model.datasets import FastSynergyDataset, FastTensorDataLoader
from model.models import MLP from model.models import MLP
from model.utils import save_args, save_best_model, find_best_model, arg_min, random_split_indices, calc_stat, conf_inv from model.utils import save_args, save_best_model, find_best_model, arg_min, random_split_indices, calc_stat, conf_inv
from const import SYNERGY_FILE, DRUG2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE, CELL2ID_FILE, OUTPUT_DIR
from const import SYNERGY_FILE, CELL_FEAT_FILE, CELL2ID_FILE, OUTPUT_DIR, DRUGNAME_2_DRUGBANKID_FILE


def eval_model(model, optimizer, loss_func, train_data, test_data, def eval_model(model, optimizer, loss_func, train_data, test_data,




def step_batch(model, batch, loss_func, gpu_id=None, train=True):
    """Run the model on one batch and return the loss.

    Args:
        model: callable taking ``(drug1_id, drug2_id, cell_feat)``.
        batch: 4-item sequence ``(drug1_id, drug2_id, cell_feat, y_true)``.
        loss_func: callable taking ``(y_pred, y_true)``.
        gpu_id: CUDA device index; when given, every batch tensor is moved
            to that device before the forward pass.
        train: when False, the prediction is the mean of the model output
            for both drug orderings.

    Returns:
        Whatever ``loss_func`` returns for the batch.
    """
    drug1_id, drug2_id, cell_feat, y_true = batch
    if gpu_id is not None:
        # The model is placed on this device by the caller; the inputs and
        # the target must follow it or the forward pass / loss will fail.
        drug1_id = drug1_id.cuda(gpu_id)
        drug2_id = drug2_id.cuda(gpu_id)
        cell_feat = cell_feat.cuda(gpu_id)
        y_true = y_true.cuda(gpu_id)
    if train:
        y_pred = model(drug1_id, drug2_id, cell_feat)
    else:
        # Average over both drug orderings at evaluation time.
        yp1 = model(drug1_id, drug2_id, cell_feat)
        yp2 = model(drug2_id, drug1_id, cell_feat)
        y_pred = (yp1 + yp2) / 2
    loss = loss_func(y_pred, y_true)
    return loss


def create_model(data, hidden_size, gpu_id=None, drug_feat_len=256):
    """Build the MLP predictor, optionally placing it on a CUDA device.

    Args:
        data: dataset exposing ``cell_feat_len()``.
        hidden_size: hidden layer width passed to ``MLP``.
        gpu_id: CUDA device index, or None to keep the model on the CPU.
        drug_feat_len: per-drug feature width fed to the MLP. Defaults to
            256, the value previously hard-coded here.
            NOTE(review): must match the per-drug embedding width the model
            actually concatenates - confirm against the drug encoder.

    Returns:
        The constructed ``MLP`` instance.
    """
    # TODO: use our own MLP model
    model = MLP(data.cell_feat_len() + 2 * drug_feat_len, hidden_size)
    if gpu_id is not None:
        model = model.cuda(gpu_id)
    return model
for valid_fold in outer_trn_folds: for valid_fold in outer_trn_folds:
inner_trn_folds = [x for x in outer_trn_folds if x != valid_fold] inner_trn_folds = [x for x in outer_trn_folds if x != valid_fold]
valid_folds = [valid_fold] valid_folds = [valid_fold]
train_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE,
train_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE,
SYNERGY_FILE, use_folds=inner_trn_folds) SYNERGY_FILE, use_folds=inner_trn_folds)
valid_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE,
valid_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE,
SYNERGY_FILE, use_folds=valid_folds, train=False) SYNERGY_FILE, use_folds=valid_folds, train=False)
train_loader = FastTensorDataLoader(*train_data.tensor_samples(), batch_size=args.batch, train_loader = FastTensorDataLoader(*train_data.tensor_samples(), batch_size=args.batch,
shuffle=True) shuffle=True)
time.sleep(10) time.sleep(10)
min_ls, min_idx = arg_min(losses) min_ls, min_idx = arg_min(losses)
best_hs, best_lr = param[min_idx] best_hs, best_lr = param[min_idx]
train_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE,
train_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE,
SYNERGY_FILE, use_folds=outer_trn_folds) SYNERGY_FILE, use_folds=outer_trn_folds)
test_data = FastSynergyDataset(DRUG2ID_FILE, CELL2ID_FILE, DRUG_FEAT_FILE, CELL_FEAT_FILE,
test_data = FastSynergyDataset(DRUGNAME_2_DRUGBANKID_FILE, CELL2ID_FILE, CELL_FEAT_FILE,
SYNERGY_FILE, use_folds=[test_fold], train=False) SYNERGY_FILE, use_folds=[test_fold], train=False)
model = create_model(train_data, best_hs, gpu_id) model = create_model(train_data, best_hs, gpu_id)
optimizer = torch.optim.Adam(model.parameters(), lr=best_lr) optimizer = torch.optim.Adam(model.parameters(), lr=best_lr)

+ 19
- 14
predictor/model/datasets.py View File

import random import random


from torch.utils.data import Dataset from torch.utils.data import Dataset
from .utils import read_map
from .utils import read_map, get_index_by_name


class FastTensorDataLoader: class FastTensorDataLoader:
""" """


class FastSynergyDataset(Dataset): class FastSynergyDataset(Dataset):


def __init__(self, drug2id_file, cell2id_file, drug_feat_file, cell_feat_file, synergy_score_file, use_folds,
def __init__(self, drugname2drugbankid_file, cell2id_file, cell_feat_file, synergy_score_file, use_folds,
train=True): train=True):
self.drug2id = read_map(drug2id_file)
self.drug2id = read_map(drugname2drugbankid_file, keep_str = True)
self.cell2id = read_map(cell2id_file) self.cell2id = read_map(cell2id_file)
self.drug_feat = np.load(drug_feat_file)
self.cell_feat = np.load(cell_feat_file) self.cell_feat = np.load(cell_feat_file)
self.samples = [] self.samples = []
self.raw_samples = [] self.raw_samples = []
drug1, drug2, cellname, score, fold = line.rstrip().split('\t') drug1, drug2, cellname, score, fold = line.rstrip().split('\t')
if drug1 in valid_drugs and drug2 in valid_drugs and cellname in valid_cells: if drug1 in valid_drugs and drug2 in valid_drugs and cellname in valid_cells:
if int(fold) in use_folds: if int(fold) in use_folds:
drug1_id = get_index_by_name(drug1)
drug2_id = get_index_by_name(drug2)
sample = [ sample = [
# TODO: specify drug_feat
torch.from_numpy(self.drug_feat[self.drug2id[drug1]]).float(),
torch.from_numpy(self.drug_feat[self.drug2id[drug2]]).float(),
# TODO: specify drug_feat
# drug1_feat + drug2_feat + cell_feat + score
torch.IntTensor([drug1_id]),
torch.IntTensor([drug2_id]),
torch.from_numpy(self.cell_feat[self.cell2id[cellname]]).float(), torch.from_numpy(self.cell_feat[self.cell2id[cellname]]).float(),
torch.FloatTensor([float(score)]), torch.FloatTensor([float(score)]),
] ]
# print(sample)
self.samples.append(sample) self.samples.append(sample)
raw_sample = [self.drug2id[drug1], self.drug2id[drug2], self.cell2id[cellname], score]
raw_sample = [drug1_id, drug2_id, self.cell2id[cellname], score]
self.raw_samples.append(raw_sample) self.raw_samples.append(raw_sample)
if train: if train:
sample = [ sample = [
torch.from_numpy(self.drug_feat[self.drug2id[drug2]]).float(),
torch.from_numpy(self.drug_feat[self.drug2id[drug1]]).float(),
torch.IntTensor([drug2_id]),
torch.IntTensor([drug1_id]),
torch.from_numpy(self.cell_feat[self.cell2id[cellname]]).float(), torch.from_numpy(self.cell_feat[self.cell2id[cellname]]).float(),
torch.FloatTensor([float(score)]), torch.FloatTensor([float(score)]),
] ]
self.samples.append(sample) self.samples.append(sample)
raw_sample = [self.drug2id[drug2], self.drug2id[drug1], self.cell2id[cellname], score]
raw_sample = [drug2_id, drug1_id, self.cell2id[cellname], score]
self.raw_samples.append(raw_sample) self.raw_samples.append(raw_sample)


def __len__(self): def __len__(self):
def __getitem__(self, item): def __getitem__(self, item):
return self.samples[item] return self.samples[item]


def drug_feat_len(self):
return self.drug_feat.shape[-1]

def cell_feat_len(self): def cell_feat_len(self):
return self.cell_feat.shape[-1] return self.cell_feat.shape[-1]


def tensor_samples(self, indices=None): def tensor_samples(self, indices=None):
if indices is None: if indices is None:
indices = list(range(len(self))) indices = list(range(len(self)))
# print('-----------------------------')
# print(self.samples)
# print('-----------------')
print(self.samples[0])
print(self.samples[i][0] and self.samples[i][1] for i in indices)
d1 = torch.cat([torch.unsqueeze(self.samples[i][0], 0) for i in indices], dim=0) d1 = torch.cat([torch.unsqueeze(self.samples[i][0], 0) for i in indices], dim=0)
d2 = torch.cat([torch.unsqueeze(self.samples[i][1], 0) for i in indices], dim=0) d2 = torch.cat([torch.unsqueeze(self.samples[i][1], 0) for i in indices], dim=0)
c = torch.cat([torch.unsqueeze(self.samples[i][2], 0) for i in indices], dim=0) c = torch.cat([torch.unsqueeze(self.samples[i][2], 0) for i in indices], dim=0)

+ 27
- 6
predictor/model/models.py View File

import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
import os
import sys


PROJ_DIR = os.path.dirname(os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..')))

sys.path.insert(0, PROJ_DIR)
from drug.models import GCN
from drug.datasets import DDInteractionDataset




class Connector(nn.Module):
    """Join GCN drug embeddings with cell-line features.

    Holds a ``DDInteractionDataset`` and a ``GCN`` built from its
    ``num_features``; ``forward`` runs the GCN over the dataset's graph and
    concatenates the rows selected for the two drugs with ``cell_feat``.
    """

    def __init__(self):
        super(Connector, self).__init__()

        # GCN
        self.ddiDataset = DDInteractionDataset()
        self.gcn = GCN(self.ddiDataset.num_features, self.ddiDataset.num_features // 2)
        # Cell line features
        # np.load('cell_feat.npy')

    def forward(self, drug1_idx, drug2_idx, cell_feat):
        """Return ``[drug1_emb, drug2_emb, cell_feat]`` concatenated on dim 1.

        Args:
            drug1_idx: node indices of the first drug, one per sample.
            drug2_idx: node indices of the second drug, one per sample.
            cell_feat: 2-D tensor of cell-line features, one row per sample.
        """
        # Fetch the graph once and reuse it for both attributes.
        data = self.ddiDataset.get()
        x = self.gcn(data.x, data.edge_index)
        # Callers may pass indices as (batch, 1) columns of 32-bit ints.
        # Flatten and cast to long so the lookup yields a 2-D
        # (batch, features) tensor that can be concatenated with cell_feat.
        # NOTE(review): a negative index (e.g. -1 for an unknown drug) would
        # silently select the last node - confirm callers never pass one.
        drug1_idx = torch.as_tensor(drug1_idx).reshape(-1).long()
        drug2_idx = torch.as_tensor(drug2_idx).reshape(-1).long()
        drug1_feat = x[drug1_idx]
        drug2_feat = x[drug2_idx]
        feat = torch.cat([drug1_feat, drug2_feat, cell_feat], 1)
        return feat




class MLP(nn.Module): class MLP(nn.Module):
nn.BatchNorm1d(hidden_size // 2), nn.BatchNorm1d(hidden_size // 2),
nn.Linear(hidden_size // 2, 1) nn.Linear(hidden_size // 2, 1)
) )

self.connector = Connector()
def forward(self, drug1_feat: torch.Tensor, drug2_feat: torch.Tensor, cell_feat: torch.Tensor):
feat = torch.cat([drug1_feat, drug2_feat, cell_feat], 1)
def forward(self, drug1_idx, drug2_idx, cell_feat): # prev input: self, drug1_feat: torch.Tensor, drug2_feat: torch.Tensor, cell_feat: torch.Tensor
feat = self.connector(drug1_idx, drug2_idx, cell_feat)
out = self.layers(feat) out = self.layers(feat)
return out return out



+ 14
- 5
predictor/model/utils.py View File

json.dump(args_dict, f, indent=2) json.dump(args_dict, f, indent=2)




def read_map(map_file, keep_str=False):
    """Read a two-column, whitespace/tab separated mapping file into a dict.

    The first line is treated as a header and skipped. Each remaining
    non-blank line is split on its last run of whitespace, so the key may
    itself contain spaces while the value is the final token.

    Args:
        map_file: path of the mapping file.
        keep_str: when True the values are kept as strings; otherwise they
            are converted with ``int``.

    Returns:
        dict mapping each key to its (string or int) value.

    Raises:
        ValueError: if a non-blank line has fewer than two columns, or a
            value is not an integer while ``keep_str`` is False.
    """
    d = {}
    with open(map_file, 'r') as f:
        f.readline()  # header
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank or trailing empty lines in the file.
                continue
            k, v = line.rsplit(None, 1)
            d[k] = v if keep_str else int(v)
    return d




project_path = os.path.dirname(os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..'))) project_path = os.path.dirname(os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..')))
drugname2drugbankid_file = os.path.join(project_path, 'drug/data/drugname2drugbankid.tsv') drugname2drugbankid_file = os.path.join(project_path, 'drug/data/drugname2drugbankid.tsv')
drug_name2drugbank_id_df = pd.read_csv(drugname2drugbankid_file , sep='\t')
drug_name2drugbank_id_df = pd.read_csv(drugname2drugbankid_file, sep='\s+')


drug_bank_id = drug_name2drugbank_id_df[drug_name2drugbank_id_df['drug_name'] == drug_name].drug_bank_id.item() drug_bank_id = drug_name2drugbank_id_df[drug_name2drugbank_id_df['drug_name'] == drug_name].drug_bank_id.item()


drug2id_file = os.path.join(project_path, 'drug/data/DDI/DrugBank/raw/', 'drug2id.tsv') drug2id_file = os.path.join(project_path, 'drug/data/DDI/DrugBank/raw/', 'drug2id.tsv')
drug2id_df = pd.read_csv(drug2id_file , sep='\t') drug2id_df = pd.read_csv(drug2id_file , sep='\t')
drug_index = drug2id_df[drug2id_df['DrugBank_id'] == drug_bank_id].node_index.item()
row = drug2id_df[drug2id_df['DrugBank_id'] == drug_bank_id]
if row.empty:
drug_index = -1
else:
drug_index = row.node_index.item()


return drug_index return drug_index

Loading…
Cancel
Save