# Standard library
import json
import os

# Third-party
import numpy as np
import torch
from tqdm import tqdm
from transformers import (
    BertModel,
    BertTokenizer,
    DistilBertModel,
    DistilBertTokenizer,
    ElectraModel,
    ElectraTokenizer,
    GPT2Model,
    GPT2Tokenizer,
    RobertaModel,
    RobertaTokenizer,
)
-
# Map each HuggingFace checkpoint name to its (tokenizer class, model class)
# pair. Only BERT is enabled here; the other imported architectures can be
# added to this dict to extract features with additional encoders.
models = {
    'bert-base-uncased': (BertTokenizer, BertModel),
}

# Dataset split to embed. Each entry is expected to carry an 'id' and a list
# of 'instructions' strings -- TODO confirm against the data files.
#json_path = 'Data/val.json'
json_path = 'Data/train.json'
# Root directory for the cached features (one subdirectory per model name).
output_dir = 'text-features'
-
# Load the dataset annotations once; every model below iterates over them.
# Explicit encoding: JSON is UTF-8 by spec, so don't rely on the platform
# default text encoding.
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
-
# Run every configured encoder over the dataset and cache one embedding file
# per datapoint under <output_dir>/<model_name>/<id>.npy.
# (The unused `counter` variable from the original script has been removed.)
device = "cuda" if torch.cuda.is_available() else "cpu"  # degrade gracefully on CPU-only hosts

for model_name, (Tokenizer, Model) in models.items():
    tokenizer = Tokenizer.from_pretrained(model_name)
    # Longest token sequence the checkpoint accepts; used to truncate inputs.
    # NOTE(review): max_model_input_sizes is deprecated in recent transformers
    # releases; tokenizer.model_max_length is the modern equivalent -- confirm
    # the pinned library version before switching.
    max_size = tokenizer.max_model_input_sizes[model_name]
    model = Model.from_pretrained(model_name)
    model.to(device)

    # The per-model output directory is loop-invariant: create it once here
    # instead of once per datapoint.
    model_dir = os.path.join(output_dir, model_name)
    os.makedirs(model_dir, exist_ok=True)

    for datapoint in tqdm(data):
        # Join the instruction sentences into a single passage to encode.
        instructions = " ".join(datapoint['instructions'])
        if "gpt" in model_name:
            # GPT-2 path: a flat 1-D id sequence (no batch dimension).
            tokenized_instructions = tokenizer.encode(instructions, add_special_tokens=True)[:max_size]
        else:
            # BERT-style path: wrap in a list to form a batch of size 1.
            tokenized_instructions = [tokenizer.encode(instructions, add_special_tokens=True)[:max_size]]

        input_ids = torch.tensor(tokenized_instructions)
        with torch.no_grad():  # inference only; skip autograd bookkeeping
            outputs = model(input_ids.to(device))
        if "gpt" in model_name:
            # presumably the first-position hidden states stand in for the
            # missing [CLS] vector on GPT-2 -- TODO confirm intent.
            embeddings = outputs.last_hidden_state[0, :].detach().cpu().numpy()
        else:
            # [CLS] (position 0) embedding for the single-item batch.
            embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

        output_path = os.path.join(model_dir, '{}.npy'.format(datapoint['id']))
        np.save(output_path, embeddings)