"""Extract per-datapoint text embeddings with a pretrained transformer.

For every datapoint in the input JSON, joins its `instructions` strings,
tokenizes them (truncated to the model's max input length), runs the
encoder, and saves a feature vector to
`<output_dir>/<model_name>/<datapoint_id>.npy`:

* encoder-style models (BERT etc.): the [CLS] token embedding, shape (1, dim)
* GPT-style models: all token embeddings, shape (seq_len, dim)
  (GPT has no [CLS] token, hence the different pooling)
"""
import json
import os

import numpy as np
import torch
from tqdm import tqdm
from transformers import (BertTokenizer, BertModel,
                          GPT2Tokenizer, GPT2Model,
                          RobertaTokenizer, RobertaModel,
                          ElectraTokenizer, ElectraModel,
                          DistilBertTokenizer, DistilBertModel)

# Registry of models to extract features with: name -> (tokenizer cls, model cls).
# Additional (Tokenizer, Model) pairs imported above can be added here.
models = {
    'bert-base-uncased': (BertTokenizer, BertModel),
}

# Toggle between the validation and training split here.
# json_path = 'Data/val.json'
json_path = 'Data/train.json'
output_dir = 'text-features'

# Fall back to CPU so the script still runs on machines without CUDA
# (the original hard-coded "cuda" and crashed otherwise).
device = "cuda" if torch.cuda.is_available() else "cpu"

with open(json_path, 'r') as f:
    data = json.load(f)

for model_name, (Tokenizer, Model) in models.items():
    tokenizer = Tokenizer.from_pretrained(model_name)
    # Maximum sequence length this checkpoint accepts; used to truncate.
    max_size = tokenizer.max_model_input_sizes[model_name]
    model = Model.from_pretrained(model_name)
    model.to(device)
    model.eval()  # inference only: make sure dropout etc. are disabled

    # Output directory depends only on the model, so create it once here
    # instead of once per datapoint.
    model_output_dir = os.path.join(output_dir, model_name)
    os.makedirs(model_output_dir, exist_ok=True)

    for datapoint in tqdm(data):
        # NOTE(review): assumes datapoint['instructions'] is a list of
        # strings — confirm against the JSON schema.
        instructions = " ".join(datapoint['instructions'])

        if "gpt" in model_name:
            # GPT path feeds a 1-D id sequence; the model broadcasts it
            # to batch size 1 internally.
            tokenized_instructions = tokenizer.encode(
                instructions, add_special_tokens=True)[:max_size]
        else:
            # Encoder path builds an explicit batch of one sequence.
            tokenized_instructions = [tokenizer.encode(
                instructions, add_special_tokens=True)[:max_size]]
        input_ids = torch.tensor(tokenized_instructions)

        with torch.no_grad():
            outputs = model(input_ids.to(device))

        if "gpt" in model_name:
            # No [CLS] token: keep every token embedding, shape (seq, dim).
            embeddings = outputs.last_hidden_state[0, :].detach().cpu().numpy()
        else:
            # [CLS] token embedding only, shape (1, dim).
            embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

        output_filename = '{}.npy'.format(datapoint['id'])
        output_path = os.path.join(model_output_dir, output_filename)
        np.save(output_path, embeddings)