Official implementation of the Fake News Revealer paper
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

data_loader.py 2.1KB

2 years ago
2 years ago
1 year ago
2 years ago
1 year ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. import torch
  2. from torch.utils.data import Dataset
  3. import albumentations as A
  4. from transformers import ViTFeatureExtractor, AutoTokenizer
  5. from transformers import BertTokenizer, RobertaTokenizer, XLNetTokenizer
  def get_transforms(config):
      """Return the albumentations pipeline used for image preprocessing.

      The pipeline resizes to a ``config.size`` x ``config.size`` square and
      then normalizes with ``max_pixel_value=255.0``; both steps are always
      applied.
      """
      side = config.size
      steps = [
          A.Resize(side, side, always_apply=True),
          A.Normalize(max_pixel_value=255.0, always_apply=True),
      ]
      return A.Compose(steps)
  def get_tokenizer(config):
      """Load the pretrained tokenizer matching the configured text encoder.

      The tokenizer class is picked from ``config.text_encoder_model``:
      names containing ``'roberta'`` use ``AutoTokenizer``, names containing
      ``'xlnet'`` use ``XLNetTokenizer``, anything else uses
      ``BertTokenizer``. Weights are loaded from ``config.text_tokenizer``.
      """
      encoder_name = config.text_encoder_model
      if 'roberta' in encoder_name:
          tokenizer_cls = AutoTokenizer
      elif 'xlnet' in encoder_name:
          tokenizer_cls = XLNetTokenizer
      else:
          tokenizer_cls = BertTokenizer
      return tokenizer_cls.from_pretrained(config.text_tokenizer)
  class DatasetLoader(Dataset):
      """Dataset base that pre-tokenizes text and prepares image preprocessing.

      This class defines no ``__getitem__``/``__len__`` itself; it provides the
      ``set_text`` and ``set_image`` helpers that build per-sample dicts.
      """

      def __init__(self, config, dataframe, mode):
          """Encode all texts up front and choose the image preprocessor.

          Args:
              config: object exposing ``text_encoder_model``,
                  ``text_tokenizer``, ``max_length`` and ``image_model_name``
                  (plus ``size`` when a ResNet image model is selected).
              dataframe: table with ``"image"``, ``"text"`` and ``"label"``
                  columns.
              mode: stored unchanged on ``self.mode``; not interpreted here.
          """
          self.config = config
          self.mode = mode
          self.image_filenames = dataframe["image"].values
          self.text = list(dataframe["text"].values)
          self.labels = dataframe["label"].values

          # The whole corpus is encoded once so set_text() only slices rows.
          tokenizer = get_tokenizer(config)
          self.encoded_text = tokenizer(
              self.text,
              padding=True,
              truncation=True,
              max_length=config.max_length,
              return_tensors='pt',
          )

          if 'resnet' in config.image_model_name:
              self.transforms = get_transforms(config)
          else:
              self.transforms = ViTFeatureExtractor.from_pretrained(config.image_model_name)

      def set_text(self, idx):
          """Return the text-side fields of sample ``idx`` as a dict.

          The dict holds a detached copy of row ``idx`` of every encoded
          tensor, plus ``'text'`` (raw string), ``'label'`` and ``'id'``
          (the index itself).
          """
          item = {
              key: values[idx].clone().detach()
              for key, values in self.encoded_text.items()
          }
          item['text'] = self.text[idx]
          item['label'] = self.labels[idx]
          item['id'] = idx
          return item

      def set_image(self, image):
          """Preprocess one image into a channels-first tensor.

          Args:
              image: a single image. The ResNet branch passes it to the
                  albumentations pipeline, so it is presumably an
                  ``(H, W, 3)`` array -- TODO confirm against callers.

          Returns:
              ``{'image': tensor}`` where the tensor is laid out ``(3, H, W)``.
          """
          if 'resnet' in self.config.image_model_name:
              image = self.transforms(image=image)['image']
              # Albumentations keeps channels last. permute() moves the channel
              # axis while keeping every pixel intact; the previous
              # reshape((3, 224, 224)) only reinterpreted the flat buffer,
              # which mixed channels with rows and broke for sizes != 224.
              # NOTE(review): models trained on the old scrambled layout need
              # retraining after this fix.
              chw = torch.as_tensor(image).permute(2, 0, 1).contiguous()
              return {'image': chw}

          features = self.transforms(images=image, return_tensors='pt')
          pixel_values = features.convert_to_tensors(tensor_type='pt')['pixel_values']
          # The extractor output carries a leading batch axis of one; drop it
          # instead of reshaping to a hard-coded 224x224 resolution.
          return {'image': pixel_values.squeeze(0)}