BSc project of Parham Saremi. The goal of the project was to detect the geographical region of a food using textual and visual features extracted from its recipe and ingredients.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

extract_recipe_vector.py 1.8KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. import json
  2. import numpy as np
  3. import os
  4. import torch
  5. from tqdm import tqdm
  6. from transformers import (BertTokenizer, BertModel,
  7. GPT2Tokenizer, GPT2Model,
  8. RobertaTokenizer, RobertaModel,
  9. ElectraTokenizer, ElectraModel,
  10. DistilBertTokenizer, DistilBertModel)
  11. models = {
  12. 'bert-base-uncased': (BertTokenizer, BertModel),
  13. }
  14. #json_path = 'Data/val.json'
  15. json_path = 'Data/train.json'
  16. output_dir = 'text-features'
  17. with open(json_path, 'r') as f:
  18. data = json.load(f)
  19. counter = 0
  20. for model_name, (Tokenizer, Model) in models.items():
  21. tokenizer = Tokenizer.from_pretrained(model_name)
  22. max_size = tokenizer.max_model_input_sizes[model_name]
  23. model = Model.from_pretrained(model_name)
  24. model.to("cuda")
  25. for datapoint in tqdm(data):
  26. instructions = " ".join(datapoint['instructions'])
  27. if "gpt" in model_name:
  28. tokenized_instructions = tokenizer.encode(instructions, add_special_tokens=True)[:max_size]
  29. else:
  30. tokenized_instructions = [tokenizer.encode(instructions, add_special_tokens=True)[:max_size]]
  31. input_ids = torch.tensor(tokenized_instructions)
  32. # print(input_ids.shape)
  33. with torch.no_grad():
  34. outputs = model(input_ids.to("cuda"))
  35. if "gpt" in model_name:
  36. embeddings = outputs.last_hidden_state[0, :].detach().cpu().numpy()
  37. else:
  38. embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
  39. # print(embeddings.shape)
  40. output_filename = '{}.npy'.format(datapoint['id'])
  41. output_path = os.path.join(output_dir, model_name, output_filename)
  42. os.makedirs(os.path.dirname(output_path), exist_ok=True)
  43. np.save(output_path, embeddings)