Bayesian Deep Ensemble Collaborative Filtering
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

DataSet.py 3.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. # -*- Encoding:UTF-8 -*-
  2. import numpy as np
  3. import sys
  4. class DataSet(object):
  5. def __init__(self, fileName):
  6. self.data, self.shape = self.getData(fileName)
  7. self.train, self.test = self.getTrainTest()
  8. self.trainDict = self.getTrainDict()
  9. def getData(self, fileName):
  10. if fileName == 'ml-1m' or fileName == 'ml-100k':
  11. # print(f"Loading {fileName} data set...")
  12. if fileName == 'ml-1m':
  13. filePath = './Data/ml-1m/ratings.dat'
  14. separator = '::'
  15. else:
  16. filePath = './Data/ml-100k/u.data'
  17. separator = '\t'
  18. data = []
  19. u = 0
  20. i = 0
  21. maxr = 0.0
  22. with open(filePath, 'r') as f:
  23. for line in f:
  24. if line:
  25. lines = line[:-1].split(separator)
  26. user = int(lines[0])
  27. movie = int(lines[1])
  28. score = float(lines[2])
  29. time = int(lines[3])
  30. data.append((user, movie, score, time))
  31. if user > u:
  32. u = user
  33. if movie > i:
  34. i = movie
  35. if score > maxr:
  36. maxr = score
  37. self.maxRate = maxr
  38. # print("Loading Success!\n"
  39. # "Data Info:\n"
  40. # "\tUser Num: {}\n"
  41. # "\tItem Num: {}\n"
  42. # "\tData Size: {}".format(u, i, len(data)))
  43. return data, [u, i]
  44. else:
  45. print("Current data set is not support!")
  46. sys.exit()
  47. def getTrainTest(self):
  48. data = self.data
  49. data = sorted(data, key=lambda x: (x[0], x[3]))
  50. train = []
  51. test = []
  52. for i in range(len(data)-1):
  53. user = data[i][0]-1
  54. item = data[i][1]-1
  55. rate = data[i][2]
  56. if data[i][0] != data[i+1][0]:
  57. test.append((user, item, rate))
  58. else:
  59. train.append((user, item, rate))
  60. test.append((data[-1][0]-1, data[-1][1]-1, data[-1][2]))
  61. return train, test
  62. def getTrainDict(self):
  63. dataDict = {}
  64. for i in self.train:
  65. dataDict[(i[0], i[1])] = i[2]
  66. return dataDict
  67. def getEmbedding(self):
  68. train_matrix = np.zeros([self.shape[0], self.shape[1]], dtype=np.float32)
  69. for i in self.train:
  70. user = i[0]
  71. movie = i[1]
  72. rating = i[2]
  73. train_matrix[user][movie] = rating
  74. return np.array(train_matrix)
  75. def getInstances(self, data, negNum):
  76. user = []
  77. item = []
  78. rate = []
  79. for i in data:
  80. user.append(i[0])
  81. item.append(i[1])
  82. rate.append(i[2])
  83. for t in range(negNum):
  84. j = np.random.randint(self.shape[1])
  85. while (i[0], j) in self.trainDict:
  86. j = np.random.randint(self.shape[1])
  87. user.append(i[0])
  88. item.append(j)
  89. rate.append(0.0)
  90. return np.array(user), np.array(item), np.array(rate)
  91. def getTestNeg(self, testData, negNum):
  92. user = []
  93. item = []
  94. for s in testData:
  95. tmp_user = []
  96. tmp_item = []
  97. u = s[0]
  98. i = s[1]
  99. tmp_user.append(u)
  100. tmp_item.append(i)
  101. neglist = set()
  102. neglist.add(i)
  103. for t in range(negNum):
  104. j = np.random.randint(self.shape[1])
  105. while (u, j) in self.trainDict or j in neglist:
  106. j = np.random.randint(self.shape[1])
  107. neglist.add(j)
  108. tmp_user.append(u)
  109. tmp_item.append(j)
  110. user.append(tmp_user)
  111. item.append(tmp_item)
  112. return [np.array(user), np.array(item)]