
Update 'utils.py'

Zahra Asgari committed 3 days ago
branch: master, commit 608be828ab

1 changed file with 395 additions and 401 deletions

utils.py (+395, -401)

import itertools as it
import os
import time
from typing import List, Tuple, Union

import numpy as np
import pandas as pd
import pubchempy as pcp
import scipy.sparse as sp
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import average_precision_score, roc_auc_score

# ----------------------------------------------------------------------------
# Model Evaluation Functions
# ----------------------------------------------------------------------------
def roc_auc(true_data: torch.Tensor, predict_data: torch.Tensor) -> float:
    """Calculate ROC-AUC score for binary classification."""
    assert torch.all((true_data >= 0) & (true_data <= 1)), "True labels must be 0 or 1"
    return roc_auc_score(true_data.cpu().numpy(), predict_data.cpu().numpy())


def ap_score(true_data: torch.Tensor, predict_data: torch.Tensor) -> float:
    """Calculate Average Precision (area under the precision-recall curve)."""
    assert torch.all((true_data >= 0) & (true_data <= 1)), "True labels must be 0 or 1"
    return average_precision_score(true_data.cpu().numpy(), predict_data.cpu().numpy())


def f1_score_binary(true_data: torch.Tensor, predict_data: torch.Tensor) -> Tuple[float, float]:
    """Calculate the maximum F1 score over all candidate thresholds, and the threshold that achieves it."""
    thresholds = torch.unique(predict_data)
    n_samples = true_data.size(0)
    ones = torch.ones((thresholds.size(0), n_samples), device=true_data.device)
    zeros = torch.zeros((thresholds.size(0), n_samples), device=true_data.device)
    predict_value = torch.where(predict_data.view(1, -1) >= thresholds.view(-1, 1), ones, zeros)
    # tpn counts correct predictions (TP + TN) per threshold; tp counts true positives.
    tpn = torch.sum(torch.where(predict_value == true_data.view(1, -1), ones, zeros), dim=1)
    tp = torch.sum(predict_value * true_data.view(1, -1), dim=1)
    # F1 = 2TP / (2TP + FP + FN), with FP + FN = n_samples - (TP + TN).
    scores = (2 * tp) / (n_samples + 2 * tp - tpn)
    max_f1_score = torch.max(scores)
    threshold = thresholds[torch.argmax(scores)]
    return max_f1_score.item(), threshold.item()


def accuracy_binary(true_data: torch.Tensor, predict_data: torch.Tensor, threshold: float) -> float:
    """Calculate accuracy using the specified threshold."""
    predict_value = torch.where(predict_data >= threshold, 1.0, 0.0)
    correct = torch.sum(predict_value == true_data).float()
    return (correct / true_data.size(0)).item()


def precision_binary(true_data: torch.Tensor, predict_data: torch.Tensor, threshold: float) -> float:
    """Calculate precision using the specified threshold."""
    predict_value = torch.where(predict_data >= threshold, 1.0, 0.0)
    tp = torch.sum(true_data * predict_value)
    fp = torch.sum((1 - true_data) * predict_value)
    return (tp / (tp + fp + 1e-8)).item()


def recall_binary(true_data: torch.Tensor, predict_data: torch.Tensor, threshold: float) -> float:
    """Calculate recall using the specified threshold."""
    predict_value = torch.where(predict_data >= threshold, 1.0, 0.0)
    tp = torch.sum(true_data * predict_value)
    fn = torch.sum(true_data * (1 - predict_value))
    return (tp / (tp + fn + 1e-8)).item()


def mcc_binary(true_data: torch.Tensor, predict_data: torch.Tensor, threshold: float) -> float:
    """Calculate the Matthews correlation coefficient (MCC) using the specified threshold."""
    predict_value = torch.where(predict_data >= threshold, 1.0, 0.0)
    true_neg = 1 - true_data
    predict_neg = 1 - predict_value
    tp = torch.sum(true_data * predict_value)
    tn = torch.sum(true_neg * predict_neg)
    fp = torch.sum(true_neg * predict_value)
    fn = torch.sum(true_data * predict_neg)
    denominator = torch.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) + 1e-8)
    return ((tp * tn - fp * fn) / denominator).item()


def evaluate_all(true_data: torch.Tensor, predict_data: torch.Tensor) -> Tuple[float, ...]:
    """Evaluate ROC-AUC, AP, accuracy, F1, MCC, precision, and recall at the F1-optimal threshold."""
    assert torch.all((true_data >= 0) & (true_data <= 1)), "True labels must be 0 or 1"
    auc = roc_auc(true_data, predict_data)
    ap = ap_score(true_data, predict_data)
    f1, threshold = f1_score_binary(true_data, predict_data)
    acc = accuracy_binary(true_data, predict_data, threshold)
    precision = precision_binary(true_data, predict_data, threshold)
    recall = recall_binary(true_data, predict_data, threshold)
    mcc = mcc_binary(true_data, predict_data, threshold)
    return auc, ap, acc, f1, mcc, precision, recall


def evaluate_auc(true_data: torch.Tensor, predict_data: torch.Tensor) -> Tuple[float, float]:
    """Calculate ROC-AUC and Average Precision."""
    assert torch.all((true_data >= 0) & (true_data <= 1)), "True labels must be 0 or 1"
    auc = roc_auc_score(true_data.cpu().numpy(), predict_data.cpu().numpy())
    ap = average_precision_score(true_data.cpu().numpy(), predict_data.cpu().numpy())
    return auc, ap
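
# Illustrative usage of the evaluation helpers (a minimal sketch; the tensors
# below are made-up examples, not data from this project):
#
#     y_true = torch.tensor([1.0, 0.0, 1.0, 1.0, 0.0])
#     y_pred = torch.tensor([0.9, 0.3, 0.8, 0.6, 0.4])
#     auc, ap, acc, f1, mcc, precision, recall = evaluate_all(y_true, y_pred)
#     # acc, precision, recall, and mcc all use the F1-optimal threshold.
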
# ----------------------------------------------------------------------------
# Loss Functions
# ----------------------------------------------------------------------------
def cross_entropy_loss(true_data: torch.Tensor, predict_data: torch.Tensor, masked: torch.Tensor) -> torch.Tensor:
    """Calculate binary cross-entropy loss over the masked entries only."""
    masked = masked.to(torch.bool)
    true_data = torch.masked_select(true_data, masked)
    pred_data = torch.masked_select(predict_data, masked)
    return nn.BCELoss()(pred_data, true_data)


def mse_loss(true_data: torch.Tensor, predict_data: torch.Tensor, masked: torch.Tensor) -> torch.Tensor:
    """Calculate masked mean squared error loss.

    Note: unmasked entries are zeroed out but still counted in the mean, so the
    loss is averaged over all elements, not only the masked ones.
    """
    true_data = true_data * masked
    predict_data = predict_data * masked
    return nn.MSELoss()(predict_data, true_data)


def prototypical_loss(
    cell_emb: torch.Tensor,
    drug_emb: torch.Tensor,
    adj_matrix: Union[torch.Tensor, np.ndarray],
    margin: float = 2.0,
) -> torch.Tensor:
    """Calculate a margin ranking loss between observed (positive) and randomly sampled (negative) cell-drug pairs."""
    if isinstance(adj_matrix, torch.Tensor):
        adj_matrix = sp.coo_matrix(adj_matrix.detach().cpu().numpy())
    elif not sp.isspmatrix_coo(adj_matrix):
        # Accept np.ndarray inputs as well; .row/.col below require COO format.
        adj_matrix = sp.coo_matrix(adj_matrix)
    pos_pairs = torch.sum(cell_emb[adj_matrix.row] * drug_emb[adj_matrix.col], dim=1)
    n_pos = len(adj_matrix.row)
    cell_neg = torch.randint(0, cell_emb.size(0), (n_pos,), device=cell_emb.device)
    drug_neg = torch.randint(0, drug_emb.size(0), (n_pos,), device=drug_emb.device)
    neg_pairs = torch.sum(cell_emb[cell_neg] * drug_emb[drug_neg], dim=1)
    labels = torch.ones_like(pos_pairs, device=cell_emb.device)
    return F.margin_ranking_loss(pos_pairs, neg_pairs, labels, margin=margin)
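
# A minimal sketch of prototypical_loss on toy data (the shapes below are
# assumptions for illustration: 4 cell lines, 3 drugs, 8-dim embeddings):
#
#     cells = torch.randn(4, 8)
#     drugs = torch.randn(3, 8)
#     adj = np.array([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 0]])
#     loss = prototypical_loss(cells, drugs, adj, margin=2.0)
#
# Each observed (cell, drug) edge becomes a positive pair that is scored
# against a randomly sampled pair in the margin ranking term.
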
# ----------------------------------------------------------------------------
# Correlation and Normalization Functions
# ----------------------------------------------------------------------------
def torch_z_normalized(tensor: torch.Tensor, dim: int = 0) -> torch.Tensor:
    """Z-normalize a 2-D tensor: dim=0 standardizes each row, dim=1 standardizes each column."""
    mean = tensor.mean(dim=1 - dim, keepdim=True)
    std = tensor.std(dim=1 - dim, keepdim=True) + 1e-8
    return (tensor - mean) / std


def torch_corr_x_y(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Compute the Pearson correlation matrix between the row vectors of two matrices."""
    x_center = x - x.mean(dim=1, keepdim=True)
    y_center = y - y.mean(dim=1, keepdim=True)
    x_std = x.std(dim=1, keepdim=True) + 1e-8
    y_std = y.std(dim=1, keepdim=True) + 1e-8
    x_norm = x_center / x_std
    y_norm = y_center / y_std
    corr_matrix = x_norm @ y_norm.t() / (x.size(1) - 1)
    return corr_matrix
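
# Sketch: Pearson correlation between two sets of profiles (shapes are
# illustrative only):
#
#     x = torch.randn(5, 100)       # 5 profiles with 100 features each
#     y = torch.randn(7, 100)
#     corr = torch_corr_x_y(x, y)   # (5, 7) matrix of row-wise correlations
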
# ----------------------------------------------------------------------------
# Distance and Similarity Functions
# ----------------------------------------------------------------------------
def torch_euclidean_dist(tensor: torch.Tensor, dim: int = 0) -> torch.Tensor:
    """Calculate pairwise Euclidean distances between rows (dim=0) or columns (dim=1) of a tensor."""
    tensor_mul = torch.mm(tensor.t(), tensor) if dim else torch.mm(tensor, tensor.t())
    diag = torch.diag(tensor_mul)
    n_diag = diag.size(0)
    tensor_diag = diag.repeat(n_diag, 1)
    diag = diag.view(n_diag, -1)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b; the clamp guards against small
    # negative values from floating-point error before the square root.
    dist = torch.sqrt(torch.clamp(tensor_diag + diag - 2 * tensor_mul, min=0.0))
    return dist


def exp_similarity(tensor: torch.Tensor, sigma: torch.Tensor, normalize: bool = True) -> torch.Tensor:
    """Calculate exponential similarity based on Euclidean distance."""
    if normalize:
        tensor = torch_z_normalized(tensor, dim=1)
    tensor_dist = torch_euclidean_dist(tensor, dim=0)
    return torch.exp(-tensor_dist / (2 * sigma.pow(2)))


def full_kernel(exp_dist: torch.Tensor) -> torch.Tensor:
    """Calculate the full kernel matrix from an exponential similarity matrix."""
    n = exp_dist.shape[0]
    ones = torch.ones(n, n, device=exp_dist.device)
    diag = torch.diag(ones)
    mask_diag = (ones - diag) * exp_dist
    mask_diag_sum = mask_diag.sum(dim=1, keepdim=True)
    mask_diag = mask_diag / (2 * mask_diag_sum) + 0.5 * diag
    return mask_diag


def sparse_kernel(exp_dist: torch.Tensor, k: int) -> torch.Tensor:
    """Calculate a sparse kernel that keeps only the k nearest neighbours per row."""
    exp_dist = exp_dist.clone()  # avoid mutating the caller's tensor in place
    n = exp_dist.shape[0]
    maxk = torch.topk(exp_dist, k, dim=1)
    mink_indices = torch.topk(exp_dist, n - k, dim=1, largest=False).indices
    exp_dist[torch.arange(n, device=exp_dist.device).view(n, 1), mink_indices] = 0
    knn_sum = maxk.values.sum(dim=1, keepdim=True)
    return exp_dist / knn_sum
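
# Sketch of the similarity-kernel pipeline in the style of similarity network
# fusion (sigma and k are illustrative values, not project defaults):
#
#     features = torch.randn(10, 32)
#     sim = exp_similarity(features, sigma=torch.tensor(2.0))
#     p = full_kernel(sim)         # dense kernel: rows sum to 1, 0.5 on the diagonal
#     s = sparse_kernel(sim, k=3)  # keep only the 3 largest entries per row
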
def scale_sigmoid(tensor: torch.Tensor, alpha: float) -> torch.Tensor:
    """Apply a scaled sigmoid transformation."""
    alpha = torch.tensor(alpha, dtype=torch.float32, device=tensor.device)
    return torch.sigmoid(alpha * tensor)
# ----------------------------------------------------------------------------
# Data Processing and Helper Functions
# ----------------------------------------------------------------------------
def init_seeds(seed: int = 0) -> None:
    """Initialize random seeds for reproducibility."""
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def distribute_compute(
    lr_list: List[float],
    wd_list: List[float],
    scale_list: List[float],
    layer_size: List[int],
    sigma_list: List[float],
    beta_list: List[float],
    workers: int,
    worker_id: int,
) -> np.ndarray:
    """Distribute hyperparameter combinations across workers, returning the slice for worker_id."""
    all_combinations = [
        [lr, wd, sc, la, sg, bt]
        for lr, wd, sc, la, sg, bt in it.product(lr_list, wd_list, scale_list, layer_size, sigma_list, beta_list)
    ]
    return np.array_split(all_combinations, workers)[worker_id]
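
# Sketch: splitting a small hyperparameter grid across 4 workers (all values
# are placeholders):
#
#     combos = distribute_compute(
#         lr_list=[1e-3, 1e-4], wd_list=[0.0, 1e-5], scale_list=[1.0],
#         layer_size=[128], sigma_list=[2.0], beta_list=[0.5],
#         workers=4, worker_id=0)
#     # the 4-combination grid is split evenly, one combination per worker
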
def get_fingerprint(cid: int) -> np.ndarray:
    """Retrieve the PubChem fingerprint for a given compound CID as a binary vector."""
    compound = pcp.Compound.from_cid(cid)
    # compound.fingerprint is a hex string; each hex character expands to 4 bits.
    fingerprint = "".join(f"{int(bit, 16):04b}" for bit in compound.fingerprint)
    return np.array([int(b) for b in fingerprint], dtype=np.int32)


def save_fingerprint(cid_list: List[int], last_cid: int, fpath: str) -> None:
    """Save fingerprints for a list of compound CIDs to disk, resuming after last_cid if given."""
    start_idx = np.where(np.array(cid_list) == last_cid)[0][0] + 1 if last_cid > 0 else 0
    for cid in cid_list[start_idx:]:
        fingerprint = get_fingerprint(cid)
        np.save(os.path.join(fpath, str(cid)), fingerprint)
        print(f"CID {cid} processed successfully.")
        time.sleep(1)  # throttle requests to the PubChem API
    print("All compounds have been processed!")


def read_fingerprint_cid(path: str) -> Tuple[np.ndarray, List[int]]:
    """Read fingerprints from .npy files in the specified directory."""
    fingerprint = []
    cids = []
    for file_name in sorted(os.listdir(path)):
        if file_name.endswith(".npy"):
            cid = int(file_name.split(".")[0])
            fing = np.load(os.path.join(path, file_name))
            fingerprint.append(fing)
            cids.append(cid)
    # Each stored fingerprint is 920 bits (230 hex characters x 4 bits).
    fingerprint = np.array(fingerprint).reshape(-1, 920)
    return fingerprint, cids
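
# Sketch of the fingerprint round-trip (the directory is an assumed example
# path; CIDs 2244 and 3672 are aspirin and ibuprofen):
#
#     save_fingerprint([2244, 3672], last_cid=0, fpath="data/fingerprint")
#     fps, cids = read_fingerprint_cid("data/fingerprint")
#     # fps.shape == (len(cids), 920)
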
def common_data_index(data_for_index: np.ndarray, data_for_cmp: np.ndarray) -> np.ndarray:
    """Find indices of elements in data_for_index that also exist in data_for_cmp."""
    return np.where(np.isin(data_for_index, data_for_cmp))[0]


def to_coo_matrix(adj_mat: Union[np.ndarray, sp.coo_matrix]) -> sp.coo_matrix:
    """Convert the input matrix to scipy.sparse.coo_matrix format."""
    if not sp.isspmatrix_coo(adj_mat):
        adj_mat = sp.coo_matrix(adj_mat)
    return adj_mat


def mask(positive: sp.coo_matrix, negative: sp.coo_matrix, dtype: type = int) -> torch.Tensor:
    """Create a mask that marks both positive and negative edges."""
    row = np.hstack((positive.row, negative.row))
    col = np.hstack((positive.col, negative.col))
    data = np.ones_like(row)
    masked = sp.coo_matrix((data, (row, col)), shape=positive.shape).toarray().astype(dtype)
    return torch.from_numpy(masked)


def to_tensor(positive: sp.coo_matrix, identity: bool = False) -> torch.Tensor:
    """Convert a sparse matrix to a dense torch.Tensor, optionally adding the identity matrix."""
    data = positive + sp.identity(positive.shape[0]) if identity else positive
    return torch.from_numpy(data.toarray()).float()
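
# Sketch: building a mask and a dense tensor from sampled edges (toy 2x2
# adjacency matrices):
#
#     pos = to_coo_matrix(np.array([[1, 0], [0, 1]]))
#     neg = to_coo_matrix(np.array([[0, 1], [0, 0]]))
#     m = mask(pos, neg)      # 1 wherever a positive or negative edge exists
#     adj = to_tensor(pos)    # dense float tensor of the positive edges
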
def np_delete_value(arr: np.ndarray, obj: np.ndarray) -> np.ndarray:
    """Remove the first occurrence of each specified value from a NumPy array."""
    indices = [np.where(arr == x)[0][0] for x in obj if x in arr]
    return np.delete(arr, indices)


def translate_result(tensor: Union[torch.Tensor, np.ndarray]) -> pd.DataFrame:
    """Convert a tensor or array to a single-row pandas DataFrame."""
    if isinstance(tensor, torch.Tensor):
        tensor = tensor.detach().cpu().numpy()
    return pd.DataFrame(tensor.reshape(1, -1))


def calculate_train_test_index(
    response: np.ndarray,
    pos_train_index: np.ndarray,
    pos_test_index: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
    """Build train/test indices by pairing the positive folds with sampled negatives.

    The test set gets as many randomly chosen negatives as it has positives;
    all remaining negatives go to the training set.
    """
    neg_response_index = np.where(response == 0)[0]
    neg_test_index = np.random.choice(neg_response_index, pos_test_index.shape[0], replace=False)
    neg_train_index = np_delete_value(neg_response_index, neg_test_index)
    test_index = np.hstack((pos_test_index, neg_test_index))
    train_index = np.hstack((pos_train_index, neg_train_index))
    return train_index, test_index
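
# Sketch: combining positive folds with sampled negatives (indices are
# illustrative):
#
#     response = np.array([1, 0, 1, 0, 0, 1, 0])
#     pos_index = np.where(response == 1)[0]
#     train_idx, test_idx = calculate_train_test_index(
#         response, pos_train_index=pos_index[:2], pos_test_index=pos_index[2:])
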
def dir_path(k: int = 1) -> str:
    """Get the directory path k levels above the current file."""
    fpath = os.path.realpath(__file__)
    dir_name = os.path.dirname(fpath).replace("\\", "/")
    for _ in range(k):
        dir_name = os.path.dirname(dir_name)
    return dir_name


def extract_row_data(data: pd.DataFrame, row: int) -> np.ndarray:
    """Extract the non-NaN values from a specific row of a DataFrame."""
    target = np.array(data.iloc[row], dtype=np.float32)
    return target[~np.isnan(target)]


def transfer_data(data: pd.DataFrame, label: str) -> pd.DataFrame:
    """Return a copy of the DataFrame with a label column added."""
    data = data.copy()
    data["label"] = label
    return data


def link_data_frame(*data: pd.DataFrame) -> pd.DataFrame:
    """Concatenate multiple DataFrames vertically."""
    return pd.concat(data, ignore_index=True)


def calculate_limit(*data: pd.DataFrame, key: Union[str, int]) -> Tuple[float, float]:
    """Calculate padded min and max values of a column across multiple DataFrames."""
    temp = pd.concat(data, ignore_index=True)
    return temp[key].min() - 0.1, temp[key].max() + 0.1


def delete_all_sub_str(string: str, sub: str, join_str: str = "") -> str:
    """Remove all occurrences of a substring, joining the remaining parts with join_str."""
    parts = string.split(sub)
    parts = [p for p in parts if p]
    return join_str.join(parts)


def get_best_index(fname: str) -> int:
    """Find the index of the AUC closest to the average AUC in a results file."""
    with open(fname, "r") as file:
        content = file.read().replace("\n", "")
    auc_str = content.split("accs")[0].split(":")[1]
    auc_str = delete_all_sub_str(auc_str, " ", ",").replace(",]", "]")
    # eval is only safe here because the results file is produced by this project.
    aucs = np.array(eval(auc_str))
    avg_auc = float(content.split("avg_aucs")[1].split(":")[1].split()[0])
    return int(np.argmin(np.abs(aucs - avg_auc)))
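
# get_best_index assumes a results file that, once newlines are stripped,
# looks roughly like the line below (inferred from the parsing above; the
# exact layout is an assumption):
#
#     aucs:[0.91 0.88 0.90 ] accs:[...] ... avg_aucs:0.90 ...
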
def gather_color_code(*string: str) -> List[Tuple[float, float, float]]:
    """Map color names to seaborn color palette codes."""
    color_str = ["blue", "orange", "green", "red", "purple", "brown", "pink", "grey", "yellow", "cyan"]
    palette = sns.color_palette()
    color_map = dict(zip(color_str, palette))
    return [color_map[color] for color in string]
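
# Sketch: fetching palette colors for plotting (the color names are examples):
#
#     colors = gather_color_code("blue", "orange", "green")
#     # returns the matching RGB triples from seaborn's default palette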