from typing import List, Tuple
import argparse
from multiprocessing import Pool
from os import makedirs, path, listdir

import numpy as np
import cv2


def resize_image(img_dir: str, img_save_dir: str, res: int) -> None:
    """Read one image, resize it to a ``res`` x ``res`` square and save it.

    The aspect ratio is not preserved: the image is stretched or squeezed to
    exactly ``(res, res)``.

    Args:
        img_dir: Path of the image file to read (despite the name, this is a
            file path, not a directory).
        img_save_dir: Path of the file the resized image is written to.
        res: Side length, in pixels, of the square output image.

    Raises:
        cv2.error: If the source file cannot be read as an image, or if
            OpenCV fails while resizing/encoding.
        OSError: If OpenCV reports that the output file was not written.
    """
    img = cv2.imread(img_dir)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        # Without this check the failure only surfaces inside cv2.resize as
        # an assertion message that does not name the offending file. The
        # exception type is kept as cv2.error so existing handlers still work.
        raise cv2.error(f'Could not read an image from {img_dir}')

    img = cv2.resize(img, dsize=(res, res))

    # cv2.imwrite returns False when the file could not be written; ignoring
    # it would silently drop images from the output set.
    if not cv2.imwrite(img_save_dir, img):
        raise OSError(f'Could not write the resized image to {img_save_dir}')


def main() -> None:
    """Resize every file in ``stage_2_train_images`` using a process pool.

    Command-line arguments are the extracted Kaggle dataset directory, the
    target resolution and the number of worker processes. Results are written
    to ``data/RSNA-Kaggle_R<resolution>`` under the same file names as the
    sources.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('kaggle_dataset_dir', type=str,
                        help='download the dataset from https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/data, extract it, and pass its path is this argument')
    parser.add_argument('resolution', type=int,
                        help='The resolution required for your model, 224 for resnet, 299 for vanilla inception, 256 for modified inception')
    parser.add_argument('cores', type=int,
                        help='The number of cores for multiprocessing.')
    args = parser.parse_args()

    # Validate the input paths explicitly rather than with ``assert``, which
    # is stripped when Python runs with -O.
    kaggle_dataset_dir = args.kaggle_dataset_dir
    if not path.exists(kaggle_dataset_dir):
        parser.error(f'{kaggle_dataset_dir} does not exist!')

    rsna_imgs_path = path.join(kaggle_dataset_dir, 'stage_2_train_images')
    if not path.exists(rsna_imgs_path):
        parser.error('Make sure there is a folder named stage_2_train_images in the passed kaggle_directory!')

    save_dir = f'data/RSNA-Kaggle_R{args.resolution}'
    makedirs(save_dir, exist_ok=True)

    # One (source, destination, resolution) tuple per file. Plain Python
    # str/int values are used so the workers receive ordinary types, and an
    # empty folder simply yields an empty job list.
    jobs: List[Tuple[str, str, int]] = [
        (path.join(rsna_imgs_path, name), path.join(save_dir, name), args.resolution)
        for name in listdir(rsna_imgs_path)
    ]
    if not jobs:
        print(f'No files found in {rsna_imgs_path}; nothing to resize.')
        return

    # starmap blocks until every job has finished; the context manager then
    # tears the pool down even if one of the workers raised.
    with Pool(args.cores) as pool:
        pool.starmap(resize_image, jobs)


if __name__ == '__main__':
    main()