| @@ -1,2 +1,44 @@ | |||
# DeepTraCDR: Predicting Cancer Drug Response Using Multimodal Deep Learning with Transformers
## Requirements
To run this project, first install the required dependencies by executing the following command in your terminal:
| ```bash | |||
| pip install -r requirements.txt | |||
| ``` | |||
| ## DeepTraCDR Model Overview | |||
| DeepTraCDR is a modular model consisting of **Common Modules** and **Experimental Modules**. | |||
| ### Common Modules | |||
- **Data**: Includes the datasets used for model training and evaluation (each CSV is loaded as a matrix indexed by its first column; see the sketch after this list):
  - **GDSC**: Contains `cell_drug.csv` (log IC50 matrix), `cell_drug_binary.csv` (binary response matrix), `cell_exprs.csv` (gene expression), `drug_feature.csv` (drug fingerprints), `null_mask.csv` (mask of missing response values), and `threshold.csv` (drug sensitivity threshold).
  - **CCLE**: Similar to GDSC, with `cell_drug.csv`, `cell_drug_binary.csv`, `cell_exprs.csv`, and `drug_feature.csv`.
  - **PDX**: Includes `pdx_response.csv` (binary patient-drug response matrix), `pdx_exprs.csv` (gene expression), `pdx_null_mask.csv` (mask of missing response values), and `drug_feature.csv`.
  - **TCGA**: Contains `patient_drug_binary.csv` (binary patient-drug response matrix), `tcga_exprs.csv` (gene expression), `tcga_null_mask.csv` (mask of missing response values), and `drug_feature.csv`.
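A minimal sketch of how these files are read (mirroring the loaders in the code, which use `pd.read_csv(..., index_col=0)`; the relative path below is illustrative):
```python
import pandas as pd

# Cell-by-drug binary response matrix; the first column holds cell identifiers
res = pd.read_csv("Data/GDSC/cell_drug_binary.csv", index_col=0)
print(res.shape)
```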
| ### Experimental Modules | |||
The experimental modules are organized into the following directories, each containing a `main.py` script that runs the respective experiment (an example invocation follows the list):
| - **`case_study`**: Contains scripts for case study experiments (e.g., `main_case_study.py`). | |||
| - **`Scenario1`**: Includes experiments for random clearing cross-validation (`Random`) and regression (`Regression`). | |||
| - **`Scenario2`**: Includes experiments for single row/column clearing (`new`) and targeted drug experiments (`Target`). | |||
| - **`Scenario3`**: Includes external validation experiments from in vitro to in vivo (`External`). | |||
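A typical run looks like the following (illustrative: the exact sub-directory layout and command-line flags are assumptions, so check each script's argument parser):
```bash
# Random cross-validation experiment on GDSC (hypothetical layout and flags)
cd Scenario1/Random
python main.py --data gdsc
```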
Each `main.py` script writes out the true and predicted values for the test set across multiple cross-validation runs. The `utils.py` module supports performance analysis with metrics such as AUC, AUPRC, ACC, F1, and MCC. The model is implemented in PyTorch with CUDA support.
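As a minimal sketch (assuming the saved true and predicted test values are loaded as 1-D float tensors), the evaluation helpers can be used as follows:
```python
import torch
from utils import evaluate_all

# Toy stand-ins for the true test labels and the model's predicted scores
true_vals = torch.tensor([1.0, 0.0, 1.0, 0.0, 1.0])
pred_vals = torch.tensor([0.9, 0.2, 0.7, 0.4, 0.6])

auc, ap, acc, f1, mcc, precision, recall = evaluate_all(true_vals, pred_vals)
print(f"AUC={auc:.3f} AUPRC={ap:.3f} ACC={acc:.3f} F1={f1:.3f} MCC={mcc:.3f}")
```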
| @@ -0,0 +1,163 @@ | |||
| import pandas as pd | |||
| import numpy as np | |||
| import scipy.sparse as sp | |||
| from utils import * | |||
| def load_data(args): | |||
| """ | |||
| Loads dataset based on the specified data type. | |||
| Args: | |||
| args: Object containing configuration parameters, including the dataset type. | |||
| Returns: | |||
        Tuple containing the response matrix, drug fingerprints, expression data, null mask, a dataset-specific count (positive-edge count for GDSC/CCLE, training-row count for PDX/TCGA), and the updated args.
| Raises: | |||
| NotImplementedError: If the specified dataset is not supported. | |||
| """ | |||
| if args.data == 'gdsc': | |||
| return _load_gdsc(args) | |||
| elif args.data == 'ccle': | |||
| return _load_ccle(args) | |||
| elif args.data == 'pdx': | |||
| return _load_pdx(args) | |||
| elif args.data == 'tcga': | |||
| return _load_tcga(args) | |||
| else: | |||
| raise NotImplementedError(f"Dataset {args.data} is not supported.") | |||
| def _load_gdsc(args): | |||
| """ | |||
| Loads GDSC dataset, including cell-drug response, drug fingerprints, gene expression, and null mask. | |||
| Args: | |||
| args: Configuration object to be updated with dataset-specific parameters. | |||
| Returns: | |||
| Tuple of response matrix, drug fingerprints, expression data, null mask, positive edge count, and updated args. | |||
| """ | |||
| args.alpha = 0.25 | |||
| args.layer_size = [512, 512] | |||
| # Load drug fingerprints | |||
| drug_fingerprints = [ | |||
| pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/ECFP6_fingerprints_GDSC.csv", index_col=0).values.astype(np.float32) | |||
| ] | |||
| # Load response, expression, and null mask data | |||
| res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_drug_binary.csv", index_col=0).values.astype(np.float32) | |||
| exprs = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_gene/merged_file_GDSC.csv", index_col=0).values.astype(np.float32) | |||
| null_mask = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/null_mask.csv", index_col=0).values.astype(np.float32) | |||
| pos_num = sp.coo_matrix(res).data.shape[0] | |||
| return res, drug_fingerprints, exprs, null_mask, pos_num, args | |||
| def _load_ccle(args): | |||
| """ | |||
| Loads CCLE dataset, including cell-drug response, drug fingerprints, gene expression, and null mask. | |||
| Args: | |||
| args: Configuration object to be updated with dataset-specific parameters. | |||
| Returns: | |||
| Tuple of response matrix, drug fingerprints, expression data, null mask, positive edge count, and updated args. | |||
| """ | |||
| args.alpha = 0.45 | |||
| args.layer_size = [512, 512] | |||
| # Load drug fingerprints | |||
| drug_fingerprints = [ | |||
| pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/CCLE/ECFP6_fingerprints.csv", index_col=0).values.astype(np.float32) | |||
| ] | |||
| # Load response and expression data, initialize null mask | |||
| res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/CCLE/cell_drug_binary.csv", index_col=0).values.astype(np.float32) | |||
| exprs = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/CCLE/merged_file.csv", index_col=0).values.astype(np.float32) | |||
| null_mask = np.zeros(res.shape, dtype=np.float32) | |||
| pos_num = sp.coo_matrix(res).data.shape[0] | |||
| return res, drug_fingerprints, exprs, null_mask, pos_num, args | |||
| def _load_pdx(args): | |||
| """ | |||
| Loads PDX dataset by merging GDSC and PDX data, aligning gene expression by common genes. | |||
| Args: | |||
| args: Configuration object to be updated with dataset-specific parameters. | |||
| Returns: | |||
| Tuple of merged response matrix, drug fingerprints, merged expression data, merged null mask, training row count, and updated args. | |||
| """ | |||
| args.alpha = 0.15 | |||
| args.layer_size = [1024, 1024] | |||
| # Load response matrices | |||
| gdsc_res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_drug_binary.csv", index_col=0).values.astype(np.float32) | |||
| pdx_res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/PDX/pdx_response.csv", index_col=0).values.astype(np.float32) | |||
| res = np.concatenate((gdsc_res, pdx_res), axis=0) | |||
| train_row = gdsc_res.shape[0] | |||
| # Load drug fingerprints | |||
| drug_finger = [ | |||
| pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/ECFP6_fingerprints_GDSC.csv", index_col=0).values.astype(np.float32) | |||
| ] | |||
| # Load and align gene expression data | |||
| gdsc_exprs_df = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_gene/merged_file_GDSC.csv", index_col=0) | |||
| pdx_exprs_df = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/PDX/pdx_exprs.csv", index_col=0) | |||
| common_genes = gdsc_exprs_df.columns.intersection(pdx_exprs_df.columns) | |||
| gdsc_exprs_filtered = gdsc_exprs_df[common_genes].values.astype(np.float32) | |||
| pdx_exprs_filtered = pdx_exprs_df[common_genes].values.astype(np.float32) | |||
| exprs = np.concatenate((gdsc_exprs_filtered, pdx_exprs_filtered), axis=0) | |||
| # Load and merge null masks | |||
| gdsc_null_mask = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/null_mask.csv", index_col=0).values.astype(np.float32) | |||
| pdx_null_mask = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/PDX/pdx_null_mask.csv", index_col=0).values.astype(np.float32) | |||
| null_mask = np.concatenate((gdsc_null_mask, pdx_null_mask), axis=0) | |||
| return res, drug_finger, exprs, null_mask, train_row, args | |||
| def _load_tcga(args): | |||
| """ | |||
| Loads TCGA dataset by merging GDSC and TCGA data, aligning gene expression by common genes. | |||
| Args: | |||
| args: Configuration object to be updated with dataset-specific parameters. | |||
| Returns: | |||
| Tuple of merged response matrix, drug fingerprints, merged expression data, merged null mask, training row count, and updated args. | |||
| """ | |||
| args.alpha = 0.1 | |||
| args.layer_size = [1024, 1024] | |||
| # Load response matrices | |||
| gdsc_res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_drug_binary.csv", index_col=0).values.astype(np.float32) | |||
| tcga_res = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/TCGA/patient_drug_binary.csv", index_col=0).values.astype(np.float32) | |||
| res = np.concatenate((gdsc_res, tcga_res), axis=0) | |||
| train_row = gdsc_res.shape[0] | |||
| # Load drug fingerprints | |||
| drug_finger = [ | |||
| pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/ECFP6_fingerprints_GDSC.csv", index_col=0).values.astype(np.float32) | |||
| ] | |||
| # Load and align gene expression data | |||
| gdsc_exprs = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/cell_gene/merged_file_GDSC.csv", index_col=0) | |||
| patient_gene = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/TCGA/tcga_gene_exprs.csv", index_col=0) | |||
| common_genes = gdsc_exprs.columns.intersection(patient_gene.columns) | |||
| gdsc_exprs_filtered = gdsc_exprs[common_genes].values.astype(np.float32) | |||
| tcga_exprs_filtered = patient_gene[common_genes].values.astype(np.float32) | |||
| exprs = np.concatenate((gdsc_exprs_filtered, tcga_exprs_filtered), axis=0) | |||
| # Load and merge null masks | |||
| gdsc_null_mask = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/GDSC/null_mask.csv", index_col=0).values.astype(np.float32) | |||
| tcga_null_mask = pd.read_csv("/media/external_16TB_1/ali_kianfar/Data/TCGA/null_mask.csv", index_col=0).values.astype(np.float32) | |||
| null_mask = np.concatenate((gdsc_null_mask, tcga_null_mask), axis=0) | |||
| return res, drug_finger, exprs, null_mask, train_row, args | |||
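# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the training pipeline):
# `load_data` only requires an object exposing a `data` attribute; each loader
# then sets `alpha` and `layer_size` on it. The hard-coded CSV paths above
# must exist on your machine for this to run.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    args = SimpleNamespace(data="gdsc")
    res, drug_finger, exprs, null_mask, pos_num, args = load_data(args)
    print(f"response: {res.shape}, exprs: {exprs.shape}, "
          f"fingerprint blocks: {len(drug_finger)}, positive edges: {pos_num}")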
| @@ -0,0 +1,360 @@ | |||
| import torch | |||
| import numpy as np | |||
| import scipy.sparse as sp | |||
| from typing import Tuple, Optional | |||
| from utils import to_coo_matrix, to_tensor, mask | |||
| class RandomSampler: | |||
| """ | |||
| Samples edges from an adjacency matrix to create train/test sets. | |||
| Converts the training set into torch.Tensor format. | |||
| """ | |||
| def __init__( | |||
| self, | |||
| adj_mat_original: np.ndarray, | |||
| train_index: np.ndarray, | |||
| test_index: np.ndarray, | |||
| null_mask: np.ndarray | |||
| ) -> None: | |||
| self.adj_mat = to_coo_matrix(adj_mat_original) | |||
| self.train_index = train_index | |||
| self.test_index = test_index | |||
| self.null_mask = null_mask | |||
| # Sample positive edges | |||
| self.train_pos = self._sample_edges(train_index) | |||
| self.test_pos = self._sample_edges(test_index) | |||
| # Sample negative edges | |||
| self.train_neg, self.test_neg = self._sample_negative_edges() | |||
| # Create masks | |||
| self.train_mask = mask(self.train_pos, self.train_neg, dtype=int) | |||
| self.test_mask = mask(self.test_pos, self.test_neg, dtype=bool) | |||
| # Convert to tensors | |||
| self.train_data = to_tensor(self.train_pos) | |||
| self.test_data = to_tensor(self.test_pos) | |||
| def _sample_edges(self, index: np.ndarray) -> sp.coo_matrix: | |||
| """Samples edges from the adjacency matrix based on provided indices.""" | |||
| row = self.adj_mat.row[index] | |||
| col = self.adj_mat.col[index] | |||
| data = self.adj_mat.data[index] | |||
| return sp.coo_matrix( | |||
| (data, (row, col)), | |||
| shape=self.adj_mat.shape | |||
| ) | |||
| def _sample_negative_edges(self) -> Tuple[sp.coo_matrix, sp.coo_matrix]: | |||
| """ | |||
| Samples negative edges for training and testing. | |||
| Negative edges are those not present in the adjacency matrix. | |||
| """ | |||
| pos_adj_mat = self.null_mask + self.adj_mat.toarray() | |||
| neg_adj_mat = sp.coo_matrix(np.abs(pos_adj_mat - 1)) | |||
| all_row, all_col, all_data = neg_adj_mat.row, neg_adj_mat.col, neg_adj_mat.data | |||
| indices = np.arange(all_data.shape[0]) | |||
| # Sample negative test edges | |||
| test_n = self.test_index.shape[0] | |||
| test_neg_indices = np.random.choice(indices, test_n, replace=False) | |||
| test_row, test_col, test_data = ( | |||
| all_row[test_neg_indices], | |||
| all_col[test_neg_indices], | |||
| all_data[test_neg_indices] | |||
| ) | |||
| test_neg = sp.coo_matrix( | |||
| (test_data, (test_row, test_col)), | |||
| shape=self.adj_mat.shape | |||
| ) | |||
| # Sample negative train edges | |||
| train_neg_indices = np.delete(indices, test_neg_indices) | |||
| train_row, train_col, train_data = ( | |||
| all_row[train_neg_indices], | |||
| all_col[train_neg_indices], | |||
| all_data[train_neg_indices] | |||
| ) | |||
| train_neg = sp.coo_matrix( | |||
| (train_data, (train_row, train_col)), | |||
| shape=self.adj_mat.shape | |||
| ) | |||
| return train_neg, test_neg | |||
| class NewSampler: | |||
| """ | |||
| Samples train/test data and masks for a specific target dimension/index. | |||
| """ | |||
| def __init__( | |||
| self, | |||
| original_adj_mat: np.ndarray, | |||
| null_mask: np.ndarray, | |||
| target_dim: Optional[int], | |||
| target_index: int | |||
| ) -> None: | |||
| self.adj_mat = original_adj_mat | |||
| self.null_mask = null_mask | |||
| self.dim = target_dim | |||
| self.target_index = target_index | |||
| self.train_data, self.test_data = self._sample_train_test_data() | |||
| self.train_mask, self.test_mask = self._sample_train_test_mask() | |||
    def _sample_target_test_index(self) -> np.ndarray:
        """Samples indices of positive test edges; a truthy `dim` treats the
        target as a column (drug), otherwise as a row (cell line)."""
        if self.dim:
            return np.where(self.adj_mat[:, self.target_index] == 1)[0]
        return np.where(self.adj_mat[self.target_index, :] == 1)[0]
| def _sample_train_test_data(self) -> Tuple[torch.Tensor, torch.Tensor]: | |||
| """Samples train and test data based on target indices.""" | |||
| test_data = np.zeros(self.adj_mat.shape, dtype=np.float32) | |||
| test_index = self._sample_target_test_index() | |||
| if self.dim: | |||
| test_data[test_index, self.target_index] = 1 | |||
| else: | |||
| test_data[self.target_index, test_index] = 1 | |||
| train_data = self.adj_mat - test_data | |||
| return torch.from_numpy(train_data), torch.from_numpy(test_data) | |||
| def _sample_train_test_mask(self) -> Tuple[torch.Tensor, torch.Tensor]: | |||
| """Creates train and test masks, including negative sampling.""" | |||
| test_index = self._sample_target_test_index() | |||
| neg_value = np.ones(self.adj_mat.shape, dtype=np.float32) - self.adj_mat - self.null_mask | |||
| neg_test_mask = np.zeros(self.adj_mat.shape, dtype=np.float32) | |||
| if self.dim: | |||
| target_neg_index = np.where(neg_value[:, self.target_index] == 1)[0] | |||
| else: | |||
| target_neg_index = np.where(neg_value[self.target_index, :] == 1)[0] | |||
| target_neg_test_index = ( | |||
| np.random.choice(target_neg_index, len(test_index), replace=False) | |||
| if len(test_index) < len(target_neg_index) | |||
| else target_neg_index | |||
| ) | |||
| if self.dim: | |||
| neg_test_mask[target_neg_test_index, self.target_index] = 1 | |||
| neg_value[:, self.target_index] = 0 | |||
| else: | |||
| neg_test_mask[self.target_index, target_neg_test_index] = 1 | |||
| neg_value[self.target_index, :] = 0 | |||
| train_mask = (self.train_data.numpy() + neg_value).astype(bool) | |||
| test_mask = (self.test_data.numpy() + neg_test_mask).astype(bool) | |||
| return torch.from_numpy(train_mask), torch.from_numpy(test_mask) | |||
| class SingleSampler: | |||
| """ | |||
| Samples train/test data and masks for a specific target index. | |||
| Returns results as torch.Tensor. | |||
| """ | |||
| def __init__( | |||
| self, | |||
| origin_adj_mat: np.ndarray, | |||
| null_mask: np.ndarray, | |||
| target_index: int, | |||
| train_index: np.ndarray, | |||
| test_index: np.ndarray | |||
| ) -> None: | |||
| self.adj_mat = origin_adj_mat | |||
| self.null_mask = null_mask | |||
| self.target_index = target_index | |||
| self.train_index = train_index | |||
| self.test_index = test_index | |||
| self.train_data, self.test_data = self._sample_train_test_data() | |||
| self.train_mask, self.test_mask = self._sample_train_test_mask() | |||
| def _sample_train_test_data(self) -> Tuple[torch.Tensor, torch.Tensor]: | |||
| """Samples train and test data for the target index.""" | |||
| test_data = np.zeros(self.adj_mat.shape, dtype=np.float32) | |||
| test_data[self.test_index, self.target_index] = 1 | |||
| train_data = self.adj_mat - test_data | |||
| return torch.from_numpy(train_data), torch.from_numpy(test_data) | |||
| def _sample_train_test_mask(self) -> Tuple[torch.Tensor, torch.Tensor]: | |||
| """Creates train and test masks with negative sampling.""" | |||
| neg_value = np.ones(self.adj_mat.shape, dtype=np.float32) - self.adj_mat - self.null_mask | |||
| neg_test_mask = np.zeros(self.adj_mat.shape, dtype=np.float32) | |||
| target_neg_index = np.where(neg_value[:, self.target_index] == 1)[0] | |||
| target_neg_test_index = np.random.choice(target_neg_index, len(self.test_index), replace=False) | |||
| neg_test_mask[target_neg_test_index, self.target_index] = 1 | |||
| neg_value[target_neg_test_index, self.target_index] = 0 | |||
| train_mask = (self.train_data.numpy() + neg_value).astype(bool) | |||
| test_mask = (self.test_data.numpy() + neg_test_mask).astype(bool) | |||
| return torch.from_numpy(train_mask), torch.from_numpy(test_mask) | |||
| class TargetSampler: | |||
| """ | |||
| Samples train/test data and masks for multiple target indices. | |||
| """ | |||
| def __init__( | |||
| self, | |||
| response_mat: np.ndarray, | |||
| null_mask: np.ndarray, | |||
| target_indexes: np.ndarray, | |||
| pos_train_index: np.ndarray, | |||
| pos_test_index: np.ndarray | |||
| ) -> None: | |||
| self.response_mat = response_mat | |||
| self.null_mask = null_mask | |||
| self.target_indexes = target_indexes | |||
| self.pos_train_index = pos_train_index | |||
| self.pos_test_index = pos_test_index | |||
| self.train_data, self.test_data = self._sample_train_test_data() | |||
| self.train_mask, self.test_mask = self._sample_train_test_mask() | |||
| def _sample_train_test_data(self) -> Tuple[torch.Tensor, torch.Tensor]: | |||
| """Samples train and test data for multiple target indices.""" | |||
| n_target = self.target_indexes.shape[0] | |||
| target_response = self.response_mat[:, self.target_indexes].reshape((-1, n_target)) | |||
| train_data = self.response_mat.copy() | |||
| train_data[:, self.target_indexes] = 0 | |||
| target_pos_value = sp.coo_matrix(target_response) | |||
| target_train_data = sp.coo_matrix( | |||
| ( | |||
| target_pos_value.data[self.pos_train_index], | |||
| (target_pos_value.row[self.pos_train_index], target_pos_value.col[self.pos_train_index]) | |||
| ), | |||
| shape=target_response.shape | |||
| ).toarray() | |||
| target_test_data = sp.coo_matrix( | |||
| ( | |||
| target_pos_value.data[self.pos_test_index], | |||
| (target_pos_value.row[self.pos_test_index], target_pos_value.col[self.pos_test_index]) | |||
| ), | |||
| shape=target_response.shape | |||
| ).toarray() | |||
| test_data = np.zeros(self.response_mat.shape, dtype=np.float32) | |||
| for i, value in enumerate(self.target_indexes): | |||
| train_data[:, value] = target_train_data[:, i] | |||
| test_data[:, value] = target_test_data[:, i] | |||
| return torch.from_numpy(train_data), torch.from_numpy(test_data) | |||
| def _sample_train_test_mask(self) -> Tuple[torch.Tensor, torch.Tensor]: | |||
| """Creates train and test masks with negative sampling for target indices.""" | |||
| target_response = self.response_mat[:, self.target_indexes] | |||
| target_ones = np.ones(target_response.shape, dtype=np.float32) | |||
| target_neg_value = target_ones - target_response - self.null_mask[:, self.target_indexes] | |||
| target_neg_value = sp.coo_matrix(target_neg_value) | |||
| ids = np.arange(target_neg_value.data.shape[0]) | |||
| target_neg_test_index = np.random.choice(ids, self.pos_test_index.shape[0], replace=False) | |||
| target_neg_test_mask = sp.coo_matrix( | |||
| ( | |||
| target_neg_value.data[target_neg_test_index], | |||
| (target_neg_value.row[target_neg_test_index], target_neg_value.col[target_neg_test_index]) | |||
| ), | |||
| shape=target_response.shape | |||
| ).toarray() | |||
| neg_test_mask = np.zeros(self.response_mat.shape, dtype=np.float32) | |||
| for i, value in enumerate(self.target_indexes): | |||
| neg_test_mask[:, value] = target_neg_test_mask[:, i] | |||
| other_neg_value = ( | |||
| np.ones(self.response_mat.shape, dtype=np.float32) | |||
| - neg_test_mask | |||
| - self.response_mat | |||
| - self.null_mask | |||
| ) | |||
| test_mask = (self.test_data.numpy() + neg_test_mask).astype(bool) | |||
| train_mask = (self.train_data.numpy() + other_neg_value).astype(bool) | |||
        # Return order must match the unpacking in __init__: (train_mask, test_mask)
        return torch.from_numpy(train_mask), torch.from_numpy(test_mask)
| class ExterSampler: | |||
| """ | |||
| Samples train/test data and masks based on row indices. | |||
| """ | |||
| def __init__( | |||
| self, | |||
| original_adj_mat: np.ndarray, | |||
| null_mask: np.ndarray, | |||
| train_index: np.ndarray, | |||
| test_index: np.ndarray | |||
| ) -> None: | |||
| self.adj_mat = original_adj_mat | |||
| self.null_mask = null_mask | |||
| self.train_index = train_index | |||
| self.test_index = test_index | |||
| self.train_data, self.test_data = self._sample_train_test_data() | |||
| self.train_mask, self.test_mask = self._sample_train_test_mask() | |||
| def _sample_train_test_data(self) -> Tuple[torch.Tensor, torch.Tensor]: | |||
| """Samples train and test data based on row indices.""" | |||
| test_data = self.adj_mat.copy() | |||
| test_data[self.train_index, :] = 0 | |||
| train_data = self.adj_mat - test_data | |||
| return torch.from_numpy(train_data), torch.from_numpy(test_data) | |||
| def _sample_train_test_mask(self) -> Tuple[torch.Tensor, torch.Tensor]: | |||
| """Creates train and test masks with negative sampling.""" | |||
| neg_value = np.ones(self.adj_mat.shape, dtype=np.float32) - self.adj_mat - self.null_mask | |||
| neg_train = neg_value.copy() | |||
| neg_train[self.test_index, :] = 0 | |||
| neg_test = neg_value.copy() | |||
| neg_test[self.train_index, :] = 0 | |||
| train_mask = (self.train_data.numpy() + neg_train).astype(bool) | |||
| test_mask = (self.test_data.numpy() + neg_test).astype(bool) | |||
| return torch.from_numpy(train_mask), torch.from_numpy(test_mask) | |||
class RegressionSampler(object):
    """
    Builds boolean train/test masks over the full response matrix for the
    regression setting. `train_index` and `test_index` are flat (row-major)
    indices into the matrix; entries flagged in `null_mask` are excluded.
    """
    def __init__(self, adj_mat_original, train_index, test_index, null_mask):
        super(RegressionSampler, self).__init__()
| if isinstance(adj_mat_original, torch.Tensor): | |||
| adj_mat_np = adj_mat_original.cpu().numpy() | |||
| else: | |||
| adj_mat_np = adj_mat_original.copy() | |||
| self.full_data = torch.FloatTensor(adj_mat_np) | |||
| rows, cols = adj_mat_np.shape | |||
| train_mask = np.zeros((rows, cols), dtype=bool) | |||
| test_mask = np.zeros((rows, cols), dtype=bool) | |||
| for idx in train_index: | |||
| row = idx // cols | |||
| col = idx % cols | |||
| if not null_mask[row, col]: | |||
| train_mask[row, col] = True | |||
| for idx in test_index: | |||
| row = idx // cols | |||
| col = idx % cols | |||
| if not null_mask[row, col]: | |||
| test_mask[row, col] = True | |||
| self.train_mask = torch.BoolTensor(train_mask) | |||
| self.test_mask = torch.BoolTensor(test_mask) | |||
| self.train_data = self.full_data.clone() | |||
| self.test_data = self.full_data.clone() | |||
| assert not torch.any(self.train_mask & self.test_mask), "Train and test masks have overlap!" | |||
| def get_train_indices(self): | |||
| indices = torch.nonzero(self.train_mask) | |||
| return indices | |||
| def get_test_indices(self): | |||
| indices = torch.nonzero(self.test_mask) | |||
| return indices | |||
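# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative): build a RandomSampler from a toy
# response matrix. `train_index` and `test_index` index into the COO data
# array of positive entries, matching how `_sample_edges` consumes them.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    adj = (rng.random((8, 5)) > 0.6).astype(np.float32)
    null_mask = np.zeros_like(adj)
    pos_num = sp.coo_matrix(adj).data.shape[0]
    perm = rng.permutation(pos_num)
    test_index, train_index = perm[: pos_num // 5], perm[pos_num // 5:]
    sampler = RandomSampler(adj, train_index, test_index, null_mask)
    print(sampler.train_data.shape, int(sampler.test_mask.sum()))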
| @@ -0,0 +1,8 @@ | |||
| pubchempy | |||
| torch==1.13.0 | |||
| numpy | |||
| scipy | |||
| pandas | |||
| scikit-learn | |||
| seaborn | |||
| hickle | |||
| @@ -0,0 +1,401 @@ | |||
| import os | |||
| import time | |||
| from typing import Tuple, List, Union, Optional | |||
| import torch | |||
| import numpy as np | |||
| import pandas as pd | |||
| import torch.nn as nn | |||
| import seaborn as sns | |||
| import pubchempy as pcp | |||
| import scipy.sparse as sp | |||
| from sklearn.metrics import roc_auc_score, average_precision_score | |||
| import itertools as it | |||
| import torch.nn.functional as F | |||
| # ---------------------------------------------------------------------------- | |||
| # Model Evaluation Functions | |||
| # ---------------------------------------------------------------------------- | |||
| def roc_auc(true_data: torch.Tensor, predict_data: torch.Tensor) -> float: | |||
| """Calculate ROC-AUC score for binary classification.""" | |||
| assert torch.all((true_data >= 0) & (true_data <= 1)), "True labels must be 0 or 1" | |||
| return roc_auc_score(true_data.cpu().numpy(), predict_data.cpu().numpy()) | |||
| def ap_score(true_data: torch.Tensor, predict_data: torch.Tensor) -> float: | |||
| """Calculate Average Precision (area under Precision-Recall curve).""" | |||
| assert torch.all((true_data >= 0) & (true_data <= 1)), "True labels must be 0 or 1" | |||
| return average_precision_score(true_data.cpu().numpy(), predict_data.cpu().numpy()) | |||
| def f1_score_binary(true_data: torch.Tensor, predict_data: torch.Tensor) -> Tuple[float, float]: | |||
| """Calculate F1 score and the optimal threshold for binary classification.""" | |||
| thresholds = torch.unique(predict_data) | |||
| n_samples = true_data.size(0) | |||
| ones = torch.ones((thresholds.size(0), n_samples), device=true_data.device) | |||
| zeros = torch.zeros((thresholds.size(0), n_samples), device=true_data.device) | |||
| predict_value = torch.where(predict_data.view(1, -1) >= thresholds.view(-1, 1), ones, zeros) | |||
| tpn = torch.sum(torch.where(predict_value == true_data.view(1, -1), ones, zeros), dim=1) | |||
| tp = torch.sum(predict_value * true_data.view(1, -1), dim=1) | |||
| scores = (2 * tp) / (n_samples + 2 * tp - tpn) | |||
| max_f1_score = torch.max(scores) | |||
| threshold = thresholds[torch.argmax(scores)] | |||
| return max_f1_score.item(), threshold.item() | |||
| def accuracy_binary(true_data: torch.Tensor, predict_data: torch.Tensor, threshold: float) -> float: | |||
| """Calculate accuracy using the specified threshold.""" | |||
| predict_value = torch.where(predict_data >= threshold, 1.0, 0.0) | |||
| correct = torch.sum(predict_value == true_data).float() | |||
| return (correct / true_data.size(0)).item() | |||
| def precision_binary(true_data: torch.Tensor, predict_data: torch.Tensor, threshold: float) -> float: | |||
| """Calculate precision using the specified threshold.""" | |||
| predict_value = torch.where(predict_data >= threshold, 1.0, 0.0) | |||
| tp = torch.sum(true_data * predict_value) | |||
| fp = torch.sum((1 - true_data) * predict_value) | |||
| return (tp / (tp + fp + 1e-8)).item() | |||
| def recall_binary(true_data: torch.Tensor, predict_data: torch.Tensor, threshold: float) -> float: | |||
| """Calculate recall using the specified threshold.""" | |||
| predict_value = torch.where(predict_data >= threshold, 1.0, 0.0) | |||
| tp = torch.sum(true_data * predict_value) | |||
| fn = torch.sum(true_data * (1 - predict_value)) | |||
| return (tp / (tp + fn + 1e-8)).item() | |||
| def mcc_binary(true_data: torch.Tensor, predict_data: torch.Tensor, threshold: float) -> float: | |||
| """Calculate Matthews Correlation Coefficient (MCC) using the specified threshold.""" | |||
| predict_value = torch.where(predict_data >= threshold, 1.0, 0.0) | |||
| true_neg = 1 - true_data | |||
| predict_neg = 1 - predict_value | |||
| tp = torch.sum(true_data * predict_value) | |||
| tn = torch.sum(true_neg * predict_neg) | |||
| fp = torch.sum(true_neg * predict_value) | |||
| fn = torch.sum(true_data * predict_neg) | |||
| denominator = torch.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) + 1e-8) | |||
| return ((tp * tn - fp * fn) / denominator).item() | |||
| def evaluate_all(true_data: torch.Tensor, predict_data: torch.Tensor) -> Tuple[float, ...]: | |||
| """Evaluate multiple metrics: ROC-AUC, AP, accuracy, F1, and MCC.""" | |||
| assert torch.all((true_data >= 0) & (true_data <= 1)), "True labels must be 0 or 1" | |||
| auc = roc_auc(true_data, predict_data) | |||
| ap = ap_score(true_data, predict_data) | |||
| f1, threshold = f1_score_binary(true_data, predict_data) | |||
| acc = accuracy_binary(true_data, predict_data, threshold) | |||
| precision = precision_binary(true_data, predict_data, threshold) | |||
| recall = recall_binary(true_data, predict_data, threshold) | |||
| mcc = mcc_binary(true_data, predict_data, threshold) | |||
| return auc, ap, acc, f1, mcc, precision, recall | |||
| def evaluate_auc(true_data: torch.Tensor, predict_data: torch.Tensor) -> Tuple[float, float]: | |||
| """Calculate ROC-AUC and Average Precision.""" | |||
| assert torch.all((true_data >= 0) & (true_data <= 1)), "True labels must be 0 or 1" | |||
| auc = roc_auc_score(true_data.cpu().numpy(), predict_data.cpu().numpy()) | |||
| ap = average_precision_score(true_data.cpu().numpy(), predict_data.cpu().numpy()) | |||
| return auc, ap | |||
| # ---------------------------------------------------------------------------- | |||
| # Loss Functions | |||
| # ---------------------------------------------------------------------------- | |||
| def cross_entropy_loss(true_data: torch.Tensor, predict_data: torch.Tensor, masked: torch.Tensor) -> torch.Tensor: | |||
| """Calculate masked binary cross-entropy loss.""" | |||
| masked = masked.to(torch.bool) | |||
| true_data = torch.masked_select(true_data, masked) | |||
| pred_data = torch.masked_select(predict_data, masked) | |||
| return nn.BCELoss()(pred_data, true_data) | |||
| def mse_loss(true_data: torch.Tensor, predict_data: torch.Tensor, masked: torch.Tensor) -> torch.Tensor: | |||
| """Calculate masked mean squared error loss.""" | |||
| true_data = true_data * masked | |||
| predict_data = predict_data * masked | |||
| return nn.MSELoss()(predict_data, true_data) | |||
| def prototypical_loss( | |||
| cell_emb: torch.Tensor, | |||
| drug_emb: torch.Tensor, | |||
| adj_matrix: Union[torch.Tensor, np.ndarray], | |||
| margin: float = 2.0 | |||
| ) -> torch.Tensor: | |||
| """Calculate prototypical loss for positive and negative pairs.""" | |||
| if isinstance(adj_matrix, torch.Tensor): | |||
| adj_matrix = sp.coo_matrix(adj_matrix.detach().cpu().numpy()) | |||
| pos_pairs = torch.sum(cell_emb[adj_matrix.row] * drug_emb[adj_matrix.col], dim=1) | |||
| n_pos = len(adj_matrix.row) | |||
| cell_neg = torch.randint(0, cell_emb.size(0), (n_pos,), device=cell_emb.device) | |||
| drug_neg = torch.randint(0, drug_emb.size(0), (n_pos,), device=drug_emb.device) | |||
| neg_pairs = torch.sum(cell_emb[cell_neg] * drug_emb[drug_neg], dim=1) | |||
| labels = torch.ones_like(pos_pairs, device=cell_emb.device) | |||
| return F.margin_ranking_loss(pos_pairs, neg_pairs, labels, margin=margin) | |||
| # ---------------------------------------------------------------------------- | |||
| # Correlation and Normalization Functions | |||
| # ---------------------------------------------------------------------------- | |||
def torch_z_normalized(tensor: torch.Tensor, dim: int = 0) -> torch.Tensor:
    """Z-normalize `tensor` using statistics computed over the dimension
    opposite to `dim` (e.g. dim=1 standardizes each column across rows)."""
    mean = tensor.mean(dim=1 - dim, keepdim=True)
    std = tensor.std(dim=1 - dim, keepdim=True) + 1e-8
    return (tensor - mean) / std
| def torch_corr_x_y(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: | |||
| """Compute correlation matrix between row vectors of two matrices.""" | |||
| x_center = x - x.mean(dim=1, keepdim=True) | |||
| y_center = y - y.mean(dim=1, keepdim=True) | |||
| x_std = x.std(dim=1, keepdim=True) + 1e-8 | |||
| y_std = y.std(dim=1, keepdim=True) + 1e-8 | |||
| x_norm = x_center / x_std | |||
| y_norm = y_center / y_std | |||
| corr_matrix = x_norm @ y_norm.t() / (x.size(1) - 1) | |||
| return corr_matrix | |||
| # ---------------------------------------------------------------------------- | |||
| # Distance and Similarity Functions | |||
| # ---------------------------------------------------------------------------- | |||
| def torch_euclidean_dist(tensor: torch.Tensor, dim: int = 0) -> torch.Tensor: | |||
| """Calculate Euclidean distance between rows or columns of a tensor.""" | |||
| tensor_mul = torch.mm(tensor.t(), tensor) if dim else torch.mm(tensor, tensor.t()) | |||
| diag = torch.diag(tensor_mul) | |||
| n_diag = diag.size(0) | |||
| tensor_diag = diag.repeat(n_diag, 1) | |||
| diag = diag.view(n_diag, -1) | |||
    # Clamp to guard against small negative values from floating-point error
    dist = torch.sqrt(torch.clamp(tensor_diag + diag - 2 * tensor_mul, min=0.0))
| return dist | |||
| def exp_similarity(tensor: torch.Tensor, sigma: torch.Tensor, normalize: bool = True) -> torch.Tensor: | |||
| """Calculate exponential similarity based on Euclidean distance.""" | |||
| if normalize: | |||
| tensor = torch_z_normalized(tensor, dim=1) | |||
| tensor_dist = torch_euclidean_dist(tensor, dim=0) | |||
| return torch.exp(-tensor_dist / (2 * sigma.pow(2))) | |||
| def full_kernel(exp_dist: torch.Tensor) -> torch.Tensor: | |||
| """Calculate full kernel matrix from exponential similarity.""" | |||
| n = exp_dist.shape[0] | |||
| ones = torch.ones(n, n, device=exp_dist.device) | |||
| diag = torch.diag(ones) | |||
| mask_diag = (ones - diag) * exp_dist | |||
| mask_diag_sum = mask_diag.sum(dim=1, keepdim=True) | |||
| mask_diag = mask_diag / (2 * mask_diag_sum) + 0.5 * diag | |||
| return mask_diag | |||
| def sparse_kernel(exp_dist: torch.Tensor, k: int) -> torch.Tensor: | |||
| """Calculate sparse kernel using k-nearest neighbors.""" | |||
| n = exp_dist.shape[0] | |||
| maxk = torch.topk(exp_dist, k, dim=1) | |||
| mink_indices = torch.topk(exp_dist, n - k, dim=1, largest=False).indices | |||
| exp_dist[torch.arange(n, device=exp_dist.device).view(n, -1), mink_indices] = 0 | |||
| knn_sum = maxk.values.sum(dim=1, keepdim=True) | |||
| return exp_dist / knn_sum | |||
| def scale_sigmoid(tensor: torch.Tensor, alpha: float) -> torch.Tensor: | |||
| """Apply scaled sigmoid transformation.""" | |||
| alpha = torch.tensor(alpha, dtype=torch.float32, device=tensor.device) | |||
| return torch.sigmoid(alpha * tensor) | |||
| # ---------------------------------------------------------------------------- | |||
| # Data Processing and Helper Functions | |||
| # ---------------------------------------------------------------------------- | |||
| def init_seeds(seed: int = 0) -> None: | |||
| """Initialize random seeds for reproducibility.""" | |||
| np.random.seed(seed) | |||
| torch.manual_seed(seed) | |||
| torch.cuda.manual_seed(seed) | |||
| torch.cuda.manual_seed_all(seed) | |||
| torch.backends.cudnn.deterministic = True | |||
| torch.backends.cudnn.benchmark = False | |||
| def distribute_compute( | |||
| lr_list: List[float], | |||
| wd_list: List[float], | |||
| scale_list: List[float], | |||
| layer_size: List[int], | |||
| sigma_list: List[float], | |||
| beta_list: List[float], | |||
| workers: int, | |||
| id: int | |||
| ) -> np.ndarray: | |||
| """Distribute hyperparameter combinations across workers.""" | |||
| all_combinations = [ | |||
| [lr, wd, sc, la, sg, bt] | |||
| for lr, wd, sc, la, sg, bt in it.product(lr_list, wd_list, scale_list, layer_size, sigma_list, beta_list) | |||
| ] | |||
| return np.array_split(all_combinations, workers)[id] | |||
def get_fingerprint(cid: int) -> np.ndarray:
    """Retrieve the PubChem fingerprint for a compound CID as a binary bit array."""
    compound = pcp.Compound.from_cid(cid)
    # Each hex character of the fingerprint string expands to four bits
    bits = "".join(f"{int(hex_char, 16):04b}" for hex_char in compound.fingerprint)
    return np.array([int(b) for b in bits], dtype=np.int32)
| def save_fingerprint(cid_list: List[int], last_cid: int, fpath: str) -> None: | |||
| """Save fingerprints for a list of compound CIDs to disk.""" | |||
| start_idx = np.where(np.array(cid_list) == last_cid)[0][0] + 1 if last_cid > 0 else 0 | |||
| for cid in cid_list[start_idx:]: | |||
| fingerprint = get_fingerprint(cid) | |||
| np.save(os.path.join(fpath, str(cid)), fingerprint) | |||
| print(f"CID {cid} processed successfully.") | |||
| time.sleep(1) | |||
| if start_idx >= len(cid_list): | |||
| print("All compounds have been processed!") | |||
| def read_fingerprint_cid(path: str) -> Tuple[np.ndarray, List[int]]: | |||
| """Read fingerprints from .npy files in the specified directory.""" | |||
| fingerprint = [] | |||
| cids = [] | |||
| for file_name in sorted(os.listdir(path)): | |||
| if file_name.endswith(".npy"): | |||
| cid = int(file_name.split(".")[0]) | |||
| fing = np.load(os.path.join(path, file_name)) | |||
| fingerprint.append(fing) | |||
| cids.append(cid) | |||
| fingerprint = np.array(fingerprint).reshape(-1, 920) | |||
| return fingerprint, cids | |||
| def common_data_index(data_for_index: np.ndarray, data_for_cmp: np.ndarray) -> np.ndarray: | |||
| """Find indices of elements in data_for_index that exist in data_for_cmp.""" | |||
| return np.where(np.isin(data_for_index, data_for_cmp))[0] | |||
| def to_coo_matrix(adj_mat: Union[np.ndarray, sp.coo_matrix]) -> sp.coo_matrix: | |||
| """Convert input matrix to scipy.sparse.coo_matrix format.""" | |||
| if not sp.isspmatrix_coo(adj_mat): | |||
| adj_mat = sp.coo_matrix(adj_mat) | |||
| return adj_mat | |||
| def mask(positive: sp.coo_matrix, negative: sp.coo_matrix, dtype: type = int) -> torch.Tensor: | |||
| """Create a mask combining positive and negative edges.""" | |||
| row = np.hstack((positive.row, negative.row)) | |||
| col = np.hstack((positive.col, negative.col)) | |||
| data = np.ones_like(row) | |||
| masked = sp.coo_matrix((data, (row, col)), shape=positive.shape).toarray().astype(dtype) | |||
| return torch.from_numpy(masked) | |||
| def to_tensor(positive: sp.coo_matrix, identity: bool = False) -> torch.Tensor: | |||
| """Convert sparse matrix to torch.Tensor, optionally adding identity matrix.""" | |||
| data = positive + sp.identity(positive.shape[0]) if identity else positive | |||
| return torch.from_numpy(data.toarray()).float() | |||
| def np_delete_value(arr: np.ndarray, obj: np.ndarray) -> np.ndarray: | |||
| """Remove specified values from a NumPy array.""" | |||
| indices = [np.where(arr == x)[0][0] for x in obj if x in arr] | |||
| return np.delete(arr, indices) | |||
| def translate_result(tensor: Union[torch.Tensor, np.ndarray]) -> pd.DataFrame: | |||
| """Convert tensor or array to a pandas DataFrame.""" | |||
| if isinstance(tensor, torch.Tensor): | |||
| tensor = tensor.detach().cpu().numpy() | |||
| return pd.DataFrame(tensor.reshape(1, -1)) | |||
| def calculate_train_test_index( | |||
| response: np.ndarray, | |||
| pos_train_index: np.ndarray, | |||
| pos_test_index: np.ndarray | |||
| ) -> Tuple[np.ndarray, np.ndarray]: | |||
| """Calculate train and test indices combining positive and negative samples.""" | |||
| neg_response_index = np.where(response == 0)[0] | |||
| neg_test_index = np.random.choice(neg_response_index, pos_test_index.shape[0], replace=False) | |||
| neg_train_index = np_delete_value(neg_response_index, neg_test_index) | |||
| test_index = np.hstack((pos_test_index, neg_test_index)) | |||
| train_index = np.hstack((pos_train_index, neg_train_index)) | |||
| return train_index, test_index | |||
| def dir_path(k: int = 1) -> str: | |||
| """Get directory path by traversing k levels up from current file.""" | |||
| fpath = os.path.realpath(__file__) | |||
| dir_name = os.path.dirname(fpath).replace("\\", "/") | |||
| for _ in range(k): | |||
| dir_name = os.path.dirname(dir_name) | |||
| return dir_name | |||
| def extract_row_data(data: pd.DataFrame, row: int) -> np.ndarray: | |||
| """Extract non-NaN data from a specific row of a DataFrame.""" | |||
| target = np.array(data.iloc[row], dtype=np.float32) | |||
| return target[~np.isnan(target)] | |||
| def transfer_data(data: pd.DataFrame, label: str) -> pd.DataFrame: | |||
| """Add a label column to a DataFrame.""" | |||
| data = data.copy() | |||
| data["label"] = label | |||
| return data | |||
| def link_data_frame(*data: pd.DataFrame) -> pd.DataFrame: | |||
| """Concatenate multiple DataFrames vertically.""" | |||
| return pd.concat(data, ignore_index=True) | |||
| def calculate_limit(*data: pd.DataFrame, key: Union[str, int]) -> Tuple[float, float]: | |||
| """Calculate min and max values of a key across multiple DataFrames.""" | |||
| temp = pd.concat(data, ignore_index=True) | |||
| return temp[key].min() - 0.1, temp[key].max() + 0.1 | |||
| def delete_all_sub_str(string: str, sub: str, join_str: str = "") -> str: | |||
| """Remove all occurrences of a substring and join with specified string.""" | |||
| parts = string.split(sub) | |||
| parts = [p for p in parts if p] | |||
| return join_str.join(parts) | |||
| def get_best_index(fname: str) -> int: | |||
| """Find the index of the AUC closest to the average AUC from a results file.""" | |||
| with open(fname, "r") as file: | |||
| content = file.read().replace("\n", "") | |||
| auc_str = content.split("accs")[0].split(":")[1] | |||
| auc_str = delete_all_sub_str(auc_str, " ", ",").replace(",]", "]") | |||
| aucs = np.array(eval(auc_str)) | |||
| avg_auc = float(content.split("avg_aucs")[1].split(":")[1].split()[0]) | |||
| return np.argmin(np.abs(aucs - avg_auc)) | |||
| def gather_color_code(*string: str) -> List[Tuple[float, float, float]]: | |||
| """Map color names to seaborn color palette codes.""" | |||
| color_str = ["bluea0"] = ["blue", "orange", "green", "red", "purple", "brown", "pink", "grey", "yellow", "cyan"] | |||
| palette = sns.color_palette() | |||
| color_map = dict(zip(color_str, palette)) | |||
| return [color_map[color] for color in string] | |||
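# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative): exercise the similarity kernel and the
# prototypical loss on toy data. Each row of the full kernel sums to 1.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    init_seeds(0)
    # Exponential-similarity kernel over toy expression profiles
    feats = torch.randn(6, 10)
    kernel = full_kernel(exp_similarity(feats, sigma=torch.tensor(1.0)))
    print(kernel.shape, kernel.sum(dim=1))
    # Prototypical margin loss on toy cell/drug embeddings with a sparse
    # positive adjacency matrix (cells x drugs)
    cell_emb, drug_emb = torch.randn(8, 16), torch.randn(5, 16)
    adj = sp.coo_matrix((np.ones(3), ([0, 2, 4], [1, 0, 3])), shape=(8, 5))
    print(prototypical_loss(cell_emb, drug_emb, adj).item())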