You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

datasets.py 4.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. import numpy as np
  2. import torch
  3. import random
  4. from torch.utils.data import Dataset
  5. from .utils import read_map
  6. class FastTensorDataLoader:
  7. """
  8. A DataLoader-like object for a set of tensors that can be much faster than
  9. TensorDataset + DataLoader because dataloader grabs individual indices of
  10. the dataset and calls cat (slow).
  11. Source: https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6
  12. """
  13. def __init__(self, *tensors, batch_size=32, shuffle=False):
  14. """
  15. Initialize a FastTensorDataLoader.
  16. :param *tensors: tensors to store. Must have the same length @ dim 0.
  17. :param batch_size: batch size to load.
  18. :param shuffle: if True, shuffle the data *in-place* whenever an
  19. iterator is created out of this object.
  20. :returns: A FastTensorDataLoader.
  21. """
  22. assert all(t.shape[0] == tensors[0].shape[0] for t in tensors)
  23. self.tensors = tensors
  24. self.dataset_len = self.tensors[0].shape[0]
  25. self.batch_size = batch_size
  26. self.shuffle = shuffle
  27. # Calculate # batches
  28. n_batches, remainder = divmod(self.dataset_len, self.batch_size)
  29. if remainder > 0:
  30. n_batches += 1
  31. self.n_batches = n_batches
  32. def __iter__(self):
  33. if self.shuffle:
  34. r = torch.randperm(self.dataset_len)
  35. self.tensors = [t[r] for t in self.tensors]
  36. self.i = 0
  37. return self
  38. def __next__(self):
  39. if self.i >= self.dataset_len:
  40. raise StopIteration
  41. batch = tuple(t[self.i:self.i + self.batch_size] for t in self.tensors)
  42. self.i += self.batch_size
  43. return batch
  44. def __len__(self):
  45. return self.n_batches
  46. class FastSynergyDataset(Dataset):
  47. def __init__(self, drug2id_file, cell2id_file, drug_feat_file, cell_feat_file, synergy_score_file, use_folds,
  48. train=True):
  49. self.drug2id = read_map(drug2id_file)
  50. self.cell2id = read_map(cell2id_file)
  51. self.drug_feat = np.load(drug_feat_file)
  52. self.cell_feat = np.load(cell_feat_file)
  53. self.samples = []
  54. self.raw_samples = []
  55. self.train = train
  56. valid_drugs = set(self.drug2id.keys())
  57. valid_cells = set(self.cell2id.keys())
  58. with open(synergy_score_file, 'r') as f:
  59. f.readline()
  60. for line in f:
  61. drug1, drug2, cellname, score, fold = line.rstrip().split('\t')
  62. if drug1 in valid_drugs and drug2 in valid_drugs and cellname in valid_cells:
  63. if int(fold) in use_folds:
  64. sample = [
  65. # TODO: specify drug_feat
  66. torch.from_numpy(self.drug_feat[self.drug2id[drug1]]).float(),
  67. torch.from_numpy(self.drug_feat[self.drug2id[drug2]]).float(),
  68. torch.from_numpy(self.cell_feat[self.cell2id[cellname]]).float(),
  69. torch.FloatTensor([float(score)]),
  70. ]
  71. self.samples.append(sample)
  72. raw_sample = [self.drug2id[drug1], self.drug2id[drug2], self.cell2id[cellname], score]
  73. self.raw_samples.append(raw_sample)
  74. if train:
  75. sample = [
  76. torch.from_numpy(self.drug_feat[self.drug2id[drug2]]).float(),
  77. torch.from_numpy(self.drug_feat[self.drug2id[drug1]]).float(),
  78. torch.from_numpy(self.cell_feat[self.cell2id[cellname]]).float(),
  79. torch.FloatTensor([float(score)]),
  80. ]
  81. self.samples.append(sample)
  82. raw_sample = [self.drug2id[drug2], self.drug2id[drug1], self.cell2id[cellname], score]
  83. self.raw_samples.append(raw_sample)
  84. def __len__(self):
  85. return len(self.samples)
  86. def __getitem__(self, item):
  87. return self.samples[item]
  88. def drug_feat_len(self):
  89. return self.drug_feat.shape[-1]
  90. def cell_feat_len(self):
  91. return self.cell_feat.shape[-1]
  92. def tensor_samples(self, indices=None):
  93. if indices is None:
  94. indices = list(range(len(self)))
  95. d1 = torch.cat([torch.unsqueeze(self.samples[i][0], 0) for i in indices], dim=0)
  96. d2 = torch.cat([torch.unsqueeze(self.samples[i][1], 0) for i in indices], dim=0)
  97. c = torch.cat([torch.unsqueeze(self.samples[i][2], 0) for i in indices], dim=0)
  98. y = torch.cat([torch.unsqueeze(self.samples[i][3], 0) for i in indices], dim=0)
  99. return d1, d2, c, y