You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

tokenizerman.py 479B

1234567891011121314
  1. from transformers import BertTokenizerFast, DataCollatorWithPadding
  2. class TokenizerMan:
  3. def __init__(self, tokenizer_kind: str, pretrained_name: str):
  4. if tokenizer_kind == 'bert':
  5. self.tokenizer = BertTokenizerFast.from_pretrained(pretrained_name)
  6. else:
  7. raise Exception('Not implemented!')
  8. def get_col_fn(self):
  9. return DataCollatorWithPadding(
  10. self.tokenizer, return_tensors='pt', padding='longest'
  11. )