# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import argparse
import os
import heapq
import math
import random

from DataSet import DataSet
from BNN import *  # provides BayesianLinear

# Set device to GPU if available
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

# Seed everything for reproducibility
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


class Model(nn.Module):
    def __init__(self, args):
        super(Model, self).__init__()
        self.dataName = args.dataName
        self.dataSet = DataSet(self.dataName)
        self.shape = self.dataSet.shape
        self.maxRate = self.dataSet.maxRate

        # Each ensemble member trains on its own bootstrap sample of the training set.
        self.sample_size = 2
        self.train_data = self.bootstrap_sample(self.sample_size)
        self.test_data = self.dataSet.test
        self.negNum = args.negNum
        # Leave-one-out evaluation: each test positive is ranked against 99 sampled negatives.
        self.testNeg = self.dataSet.getTestNeg(self.test_data, 99)

        self.userLayer = args.userLayer
        self.itemLayer = args.itemLayer
        self.user_item_embedding = nn.Parameter(
            torch.tensor(self.dataSet.getEmbedding(), dtype=torch.float32).to(DEVICE))
        # Transposed view of the same matrix; it shares storage with the parameter above.
        self.item_user_embedding = self.user_item_embedding.t().to(DEVICE)

        # User tower: a stack of Bayesian linear layers.
        self.user_layers = nn.ModuleList()
        input_size = self.shape[1]
        for size in self.userLayer:
            self.user_layers.append(
                BayesianLinear(input_size, size, prior_type=args.priorType, device=DEVICE))
            input_size = size

        # Item tower: a stack of Bayesian linear layers.
        self.item_layers = nn.ModuleList()
        input_size = self.shape[0]
        for size in self.itemLayer:
            self.item_layers.append(
                BayesianLinear(input_size, size, prior_type=args.priorType, device=DEVICE))
            input_size = size

        # Deterministic MLP that maps the fused representation to a single score.
        self.interaction_layer = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

        self.attention = nn.MultiheadAttention(
            embed_dim=input_size, num_heads=4, dropout=0.3, batch_first=True)

    def forward(self, user, item):
        user_input = self.user_item_embedding[user]
        item_input = self.item_user_embedding[item]

        for layer in self.user_layers:
            user_input = torch.relu(layer(user_input))
        for layer in self.item_layers:
            item_input = torch.relu(layer(item_input))

        user_att = user_input.unsqueeze(1)  # Shape: (batch_size, 1, embed_dim)
        item_att = item_input.unsqueeze(1)  # Shape: (batch_size, 1, embed_dim)
        # Element-wise product fuses the towers; the sequence length stays 1.
        combined = user_att * item_att  # Shape: (batch_size, 1, embed_dim)

        att_output, att_weights = self.attention(
            query=combined,
            key=combined,
            value=combined
        )
        att_output = att_output.mean(dim=1)

        interaction_input = att_output
        y_hat = self.interaction_layer(interaction_input)
        y_hat = torch.sigmoid(y_hat)
        # Clamp away from zero so BCELoss never sees log(0).
        return torch.clamp(y_hat.squeeze(), min=1e-6, max=1.0)

    def log_prior(self, layer_type):
        if layer_type == "user":
            return sum(layer.log_prior for layer in self.user_layers)
        else:
            return sum(layer.log_prior for layer in self.item_layers)

    def log_variational_posterior(self, layer_type):
        if layer_type == "user":
            return sum(layer.log_variational_posterior for layer in self.user_layers)
        else:
            return sum(layer.log_variational_posterior for layer in self.item_layers)

    def sample_elbo(self, user_tensor, item_tensor, target, num_samples, num_batches):
        # `target`, `num_batches`, and the sampled `outputs` are kept for API
        # compatibility; only the Monte Carlo KL estimate is returned here.
        outputs = torch.zeros(num_samples, user_tensor.size(0), device=DEVICE)
        user_log_priors = torch.zeros(num_samples, device=DEVICE)
        user_log_variational_posteriors = torch.zeros(num_samples, device=DEVICE)
        item_log_priors = torch.zeros(num_samples, device=DEVICE)
        item_log_variational_posteriors = torch.zeros(num_samples, device=DEVICE)

        for i in range(num_samples):
            outputs[i] = self(user_tensor, item_tensor)
            user_log_priors[i] = self.log_prior(layer_type="user")
            user_log_variational_posteriors[i] = self.log_variational_posterior(layer_type="user")
            item_log_priors[i] = self.log_prior(layer_type="item")
            item_log_variational_posteriors[i] = self.log_variational_posterior(layer_type="item")

        user_log_prior = user_log_priors.mean()
        user_log_variational_posterior = user_log_variational_posteriors.mean()
        item_log_prior = item_log_priors.mean()
        item_log_variational_posterior = item_log_variational_posteriors.mean()

        # KL(q || p) estimated per tower as E_q[log q - log p].
        item_loss = item_log_variational_posterior - item_log_prior
        user_loss = user_log_variational_posterior - user_log_prior
        return user_loss + item_loss

    def bootstrap_sample(self, sample_size):
        """Generate a bootstrapped dataset by sampling with replacement."""
        indices = np.random.choice(len(self.dataSet.train),
                                   size=len(self.dataSet.train) // sample_size,
                                   replace=True)
        sampled_train = [self.dataSet.train[i] for i in indices]
        return sampled_train
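
# Note on the KL weight (illustrative, not part of the original pipeline):
# run_epoch below scales the complexity term by a fixed constant kl_coef,
# presumably tuned by hand. In Bayes-by-Backprop (Blundell et al., 2015) the
# KL is instead down-weighted per minibatch, either uniformly by
# 1 / num_batches or with the paper's geometric schedule
# pi_i = 2^(M - i) / (2^M - 1) for minibatch i of M. A minimal sketch of the
# geometric schedule, written in a numerically safe form:
def kl_weight(batch_idx, num_batches):
    """Geometric KL weight for 0-indexed `batch_idx` out of `num_batches`
    (illustrative alternative to the fixed kl_coef; the weights sum to 1 over
    an epoch, so early batches carry most of the complexity cost)."""
    return 2.0 ** (-(batch_idx + 1)) / (1.0 - 2.0 ** (-num_batches))
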

class SuperModel(nn.Module):
    """A super model that combines predictions from multiple ensemble models
    using a neural network."""

    def __init__(self, ensemble_models, input_size):
        super(SuperModel, self).__init__()
        self.ensemble_models = ensemble_models
        self.combiner = nn.Sequential(
            nn.Linear(input_size, input_size // 2),
            nn.ReLU(),
            nn.Linear(input_size // 2, 1),
            nn.Sigmoid()  # Ensure the output is a probability
        )

    def forward(self, user, item):
        """Forward pass of the super model: combines the ensemble models'
        predictions with the combiner network."""
        ensemble_predictions = []
        with torch.no_grad():  # No gradients flow through the frozen ensemble members
            for model in self.ensemble_models:
                model.eval()  # Individual models stay in evaluation mode
                predictions = model(user, item)
                ensemble_predictions.append(predictions)

        # Stack predictions to create the input for the combiner network
        stacked_predictions = torch.stack(ensemble_predictions, dim=1)  # Shape: (batch_size, num_ensemble_models)
        combined_predictions = self.combiner(stacked_predictions).squeeze(-1)  # Shape: (batch_size,)
        return combined_predictions


def run_epoch(model, optimizer, criterion, args):
    model.train()
    train_u, train_i, train_r = model.dataSet.getInstances(model.train_data, args.negNum)
    train_len = len(train_u)
    shuffled_idx = np.random.permutation(np.arange(train_len))
    train_u, train_i, train_r = train_u[shuffled_idx], train_i[shuffled_idx], train_r[shuffled_idx]

    # Ceiling division so the last partial batch is included.
    num_batches = (train_len + args.batchSize - 1) // args.batchSize
    BCE_losses, kls = [], []
    for i in range(num_batches):
        min_idx = i * args.batchSize
        max_idx = min(train_len, (i + 1) * args.batchSize)

        user_tensor = torch.tensor(train_u[min_idx:max_idx], dtype=torch.long).to(DEVICE)
        item_tensor = torch.tensor(train_i[min_idx:max_idx], dtype=torch.long).to(DEVICE)
        rate_tensor = torch.tensor(train_r[min_idx:max_idx], dtype=torch.float32).to(DEVICE)
        # Min-max normalize ratings into [0, 1] so they can serve as BCE targets.
        rate_tensor = (rate_tensor - rate_tensor.min()) / (rate_tensor.max() - rate_tensor.min())

        optimizer.zero_grad()
        y_hat = model(user_tensor, item_tensor)
        loss = criterion(y_hat, rate_tensor)
        BCE_losses.append(loss.item())

        # Add the KL complexity term, scaled by a small fixed coefficient.
        kl_coef = 4.42322e-08
        loss += kl_coef * model.sample_elbo(user_tensor, item_tensor, rate_tensor, 5, num_batches)
        loss.backward()
        optimizer.step()
        kls.append(loss.item())  # Total loss: BCE + weighted KL

        if i % 10 == 0:
            print(f'\rBatch {i}/{num_batches}: total = {np.mean(kls[-10:]):.4f}, '
                  f'BCE = {np.mean(BCE_losses[-10:]):.4f}', end='')

    print(f"\nMean BCE loss: {np.mean(BCE_losses):.4f}")
    print(f"Mean total loss (BCE + weighted KL): {np.mean(kls):.4f}")
    return np.mean(kls)
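
# Evaluation protocol note (illustrative): this is the standard leave-one-out
# ranking setup from the NCF literature. Each test user's held-out positive is
# ranked against 99 sampled negatives (see getTestNeg in Model.__init__).
# HR@K is 1 if the positive lands in the top-K list; NDCG@K rewards higher
# ranks via log(2) / log(rank + 2) for a 0-indexed rank, so rank 0 scores 1.0
# and rank 2 scores log(2) / log(4) = 0.5. A minimal sketch:
def ndcg_at_rank(rank_index):
    """NDCG contribution of a positive at 0-indexed `rank_index` (illustrative
    helper, not used by the pipeline; evaluate() computes the same quantity inline)."""
    return math.log(2) / math.log(rank_index + 2)
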

def evaluate(model, topK):
    model.eval()
    hr, NDCG = [], []
    with torch.no_grad():
        for i in range(len(model.testNeg[0])):
            user_tensor = torch.tensor(model.testNeg[0][i], dtype=torch.long).to(DEVICE)
            item_tensor = torch.tensor(model.testNeg[1][i], dtype=torch.long).to(DEVICE)
            predict = model(user_tensor, item_tensor)
            # The first item in each test list is the held-out positive;
            # the remaining 99 are sampled negatives.
            target = item_tensor[0].item()
            item_score_dict = {item.item(): predict[j].item() for j, item in enumerate(item_tensor)}
            ranklist = heapq.nlargest(topK, item_score_dict, key=item_score_dict.get)
            hr.append(1 if target in ranklist else 0)
            NDCG.append(math.log(2) / math.log(ranklist.index(target) + 2)
                        if target in ranklist else 0)
    return np.mean(hr), np.mean(NDCG)


def main():
    parser = argparse.ArgumentParser(description="Options")
    parser.add_argument('-dataName', action='store', dest='dataName', default='ml-100k')
    parser.add_argument('-negNum', action='store', dest='negNum', default=5, type=int)
    parser.add_argument('-userLayer', action='store', dest='userLayer', default=[512, 64, 64], type=int, nargs='+')
    parser.add_argument('-itemLayer', action='store', dest='itemLayer', default=[1024, 64, 64], type=int, nargs='+')
    parser.add_argument('-lr', action='store', dest='lr', default=0.0001, type=float)
    parser.add_argument('-maxEpochs', action='store', dest='maxEpochs', default=50, type=int)
    parser.add_argument('-batchSize', action='store', dest='batchSize', default=256, type=int)
    parser.add_argument('-earlyStop', action='store', dest='earlyStop', default=5, type=int)
    parser.add_argument('-checkPoint', action='store', dest='checkPoint', default='./checkPoint/')
    parser.add_argument('-topK', action='store', dest='topK', default=10, type=int)
    parser.add_argument('-loadModel', action='store_true', dest='loadModel', help="Load a saved model")
    parser.add_argument('-ensembleSize', action='store', dest='ensembleSize', default=10, type=int)
    parser.add_argument('-maxEpochN', action='store', dest='maxEpochN', default=30, type=int)
    parser.add_argument('-priorType', action='store', dest='priorType', default='ScaleMixtureGaussian',
                        choices=['ScaleMixtureGaussian', 'Laplace', 'IsotropicGaussian'])
    args = parser.parse_args()

    if not os.path.exists(args.checkPoint):
        os.makedirs(args.checkPoint)

    # Build the ensemble: each member gets randomly chosen tower sizes and prior.
    # All candidate configurations end in 64 so the user/item tower outputs align.
    ensemble_models = []
    optimizers = []
    ensemble_args = []
    network_layers = [[512, 64, 64], [512, 64], [1024, 64, 64], [512, 256, 64], [1024, 256, 256, 64]]
    prior_types = ['ScaleMixtureGaussian', 'Laplace', 'IsotropicGaussian']
    for ensemble_idx in range(args.ensembleSize):
        args_copy = argparse.Namespace(**vars(args))
        args_copy.userLayer = random.choice(network_layers)
        args_copy.itemLayer = random.choice(network_layers)
        args_copy.priorType = random.choice(prior_types)
        ensemble_model = Model(args_copy).to(DEVICE)
        ensemble_args.append(args_copy)
        optimizer = optim.Adam(ensemble_model.parameters(), lr=args.lr)
        ensemble_models.append(ensemble_model)
        optimizers.append(optimizer)

    criterion = nn.BCELoss()

    # Train each ensemble member independently with early stopping.
    for ensemble_idx in range(args.ensembleSize):
        best_hr = -1
        best_NDCG = -1
        best_epoch = -1
        print(f'Ensemble Model Number {ensemble_idx}')
        print("Start Training!")
        print(ensemble_args[ensemble_idx])
        classifier = ensemble_models[ensemble_idx]
        optimizer = optimizers[ensemble_idx]
        for epoch in range(args.maxEpochs):
            print("=" * 20 + "Epoch " + str(epoch) + "=" * 20)
            run_epoch(classifier, optimizer, criterion, args)
            print('=' * 50)
            print("Start Evaluation!")
            hr, NDCG = evaluate(classifier, args.topK)
            print("Epoch ", epoch, "HR: {}, NDCG: {}".format(hr, NDCG))
            if hr > best_hr or NDCG > best_NDCG:
                best_hr = hr
                best_NDCG = NDCG
                best_epoch = epoch
                torch.save(classifier.state_dict(),
                           os.path.join(args.checkPoint, f'model{ensemble_idx}.pth'))
            if epoch - best_epoch > args.earlyStop:
                print("Normal Early stop!")
                break
            print("=" * 20 + "Epoch " + str(epoch) + " End" + "=" * 20)
        print("Best hr: {}, NDCG: {}, At Epoch {}".format(best_hr, best_NDCG, best_epoch))
        print("Training complete!\n")

    # Reload the best checkpoint of each member before building the super model.
    for ensemble_idx in range(args.ensembleSize):
        model_path = os.path.join(args.checkPoint, f'model{ensemble_idx}.pth')
        if os.path.exists(model_path):
            print("Loading saved model from", model_path)
            ensemble_models[ensemble_idx].load_state_dict(torch.load(model_path))
        else:
            print("No saved model found at", model_path)

    super_model = SuperModel(ensemble_models, input_size=len(ensemble_models)).to(DEVICE)
    for epoch in range(args.maxEpochN):
        train_super_model(super_model, ensemble_models[0].dataSet.train, args)

    print("\nStart Testing")
    total_hr, total_NDCG = ensemble_eval(ensemble_models, super_model, args.topK)
    print("total hr: {}, total NDCG: {}".format(total_hr, total_NDCG))
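
# Note (illustrative): SuperModel implements a stacking ensemble -- the
# combiner MLP learns how to weight the members' predictions. A simpler
# baseline for comparison, not used anywhere in this pipeline, is uniform
# averaging of the member outputs:
def average_ensemble(predictions):
    """Uniformly average a list of (batch_size,) prediction tensors
    (illustrative baseline; takes the same per-model outputs SuperModel stacks)."""
    return torch.stack(predictions, dim=1).mean(dim=1)
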

def train_super_model(super_model, train_data, args):
    super_model.train()
    # Note: a fresh optimizer is created on every call, so Adam state does not
    # persist across epochs.
    optimizer = optim.Adam(super_model.parameters(), lr=args.lr)
    criterion = nn.BCELoss()

    train_u, train_i, train_r = super_model.ensemble_models[0].dataSet.getInstances(train_data, args.negNum)
    train_len = len(train_u)
    shuffled_idx = np.random.permutation(np.arange(train_len))
    train_u = train_u[shuffled_idx]
    train_i = train_i[shuffled_idx]
    train_r = train_r[shuffled_idx]

    # Ceiling division, matching run_epoch; this avoids an empty trailing batch
    # when train_len is an exact multiple of batchSize.
    num_batches = (train_len + args.batchSize - 1) // args.batchSize
    losses = []
    for i in range(num_batches):
        min_idx = i * args.batchSize
        max_idx = min(train_len, (i + 1) * args.batchSize)

        user_tensor = torch.tensor(train_u[min_idx:max_idx], dtype=torch.long).to(DEVICE)
        item_tensor = torch.tensor(train_i[min_idx:max_idx], dtype=torch.long).to(DEVICE)
        rate_tensor = torch.tensor(train_r[min_idx:max_idx], dtype=torch.float32).to(DEVICE)
        # Min-max normalize ratings into [0, 1] so they can serve as BCE targets.
        rate_tensor = (rate_tensor - rate_tensor.min()) / (rate_tensor.max() - rate_tensor.min())

        optimizer.zero_grad()
        y_hat = super_model(user_tensor, item_tensor)
        loss = criterion(y_hat, rate_tensor)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

        if i % 10 == 0:
            print(f'\rBatch {i}/{num_batches}: loss = {np.mean(losses[-10:]):.4f}', end='')

    print("\nMean loss for super model in this epoch is: {}".format(np.mean(losses)))


def ensemble_eval(ensemble_models, superModel, topK):
    def getHitRatio(ranklist, targetItem):
        return 1 if targetItem in ranklist else 0

    def getNDCG(ranklist, targetItem):
        for i, item in enumerate(ranklist):
            if item == targetItem:
                return math.log(2) / math.log(i + 2)
        return 0

    hr = []
    NDCG = []
    testUser = ensemble_models[0].testNeg[0]
    testItem = ensemble_models[0].testNeg[1]
    with torch.no_grad():
        for i in range(len(testUser)):
            # The first item in each test list is the held-out positive.
            target = testItem[i][0]
            user_tensor = torch.tensor(testUser[i], dtype=torch.long).to(DEVICE)
            item_tensor = torch.tensor(testItem[i], dtype=torch.long).to(DEVICE)
            total_predict = superModel(user_tensor, item_tensor)
            item_score_dict = {item: total_predict[j].item() for j, item in enumerate(testItem[i])}
            ranklist = heapq.nlargest(topK, item_score_dict, key=item_score_dict.get)
            hr.append(getHitRatio(ranklist, target))
            NDCG.append(getNDCG(ranklist, target))
    return np.mean(hr), np.mean(NDCG)


if __name__ == '__main__':
    main()
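
# Example invocation (illustrative; the script filename below is a
# placeholder, but the flags match the argparse options defined in main()):
#   python train_ensemble.py -dataName ml-100k -ensembleSize 10 -maxEpochs 50 -topK 10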