web_stain_sample.py (11 KB)
  1. import enum
  2. import json
  3. import time
  4. from io import BytesIO
  5. from urllib.request import Request, urlopen
  6. import cv2
  7. import numpy as np
  8. from PIL import Image
  9. from tifffile import TiffWriter
  10. from database_crawlers.utils import find_in_log_n, fetch_tile_content, download_urls_in_thread
  11. class StainType(enum.Enum):
  12. H_AND_E = 0, "H&E"
  13. UNKNOWN = 1, "UNKNOWN"
  14. class ThyroidCancerLevel(enum.Enum):
  15. UNKNOWN = -1, "UNKNOWN"
  16. MALIGNANT = 0, "MALIGNANT"
  17. BENIGN = 1, "BENIGN"
  18. @staticmethod
  19. def get_thyroid_level_from_diagnosis_label(label: str):
  20. label = label.lower()
  21. if "malignant" in label:
  22. return ThyroidCancerLevel.MALIGNANT
  23. elif "benign" in label:
  24. return ThyroidCancerLevel.BENIGN
  25. else:
  26. return ThyroidCancerLevel.UNKNOWN
  27. class ThyroidType(enum.Enum):
  28. UNKNOWN = -1, "UNKNOWN"
  29. NORMAL = 0, "NORMAL"
  30. PAPILLARY_CARCINOMA = 1, "PAPILLARY_CARCINOMA"
  31. GRAVES_DISEASE = 2, "GRAVES_DISEASE"
  32. NODULAR_GOITER = 3, "NODULAR_GOITER"
  33. HASHIMOTO_THYROIDITIS = 4, "HASHIMOTO_THYROIDITIS"
  34. FOLLICULAR_CARCINOMA = 5, "FOLLICULAR_CARCINOMA"
  35. FOLLICULAR_ADENOMA = 6, "FOLLICULAR_ADENOMA"
  36. COLLOID_GOITER = 7, "COLLOID_GOITER"
  37. @staticmethod
  38. def get_thyroid_type_from_diagnosis_label(label: str):
  39. label = label.lower()
  40. if "normal" in label:
  41. return ThyroidType.NORMAL
  42. elif "papillary" in label:
  43. return ThyroidType.PAPILLARY_CARCINOMA
  44. elif "grave" in label:
  45. return ThyroidType.GRAVES_DISEASE
  46. elif "nodular" in label and "goiter" in label:
  47. return ThyroidType.NODULAR_GOITER
  48. elif "hashimoto" in label:
  49. return ThyroidType.HASHIMOTO_THYROIDITIS
  50. elif "follicular" in label:
  51. if "adenoma" in label:
  52. return ThyroidType.FOLLICULAR_ADENOMA
  53. else:
  54. return ThyroidType.FOLLICULAR_CARCINOMA
  55. elif "colloid" in label and "goiter" in label:
  56. return ThyroidType.COLLOID_GOITER
  57. else:
  58. return ThyroidType.UNKNOWN
  59. class WebStainImage:
  60. save_path = "data/"
  61. def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
  62. self.database_name = database_name
  63. self.image_id = image_id
  64. self.image_web_label = image_web_label
  65. self.report = report
  66. self.stain_type = stain_type
  67. self.is_wsi = is_wsi
  68. def to_json(self):
  69. return {"database_name": self.database_name,
  70. "image_id": self.image_id,
  71. "image_web_label": self.image_web_label,
  72. "image_class_label": self.image_class_label,
  73. "report": self.report,
  74. "stain_type": self.stain_type.value[1],
  75. "is_wsi": self.is_wsi}
  76. @staticmethod
  77. def sorted_json_keys():
  78. return ["database_name",
  79. "image_id",
  80. "image_web_label",
  81. "image_class_label",
  82. "report",
  83. "stain_type",
  84. "is_wsi"]
  85. @property
  86. def image_class_label(self):
  87. return ThyroidType.get_thyroid_type_from_diagnosis_label(self.image_web_label).value[1]
  88. def get_slide_view_url(self):
  89. raise NotImplemented("get_slide_view_url")
  90. def crawl_image_save_jpeg_and_json(self):
  91. raise NotImplemented("crawl_image_get_jpeg")
  92. def _get_file_path_name(self):
  93. return self.save_path + self.image_id
  94. def _get_relative_image_path(self):
  95. return self._get_file_path_name() + ".jpeg"
  96. def _get_relative_tiff_image_path(self):
  97. return self._get_file_path_name() + ".tiff"
  98. def _get_relative_json_path(self):
  99. return self._get_file_path_name() + ".json"
  100. def _save_json_file(self):
  101. json_object = json.dumps(self.to_json())
  102. with open(self._get_relative_json_path(), "w") as outfile:
  103. outfile.write(json_object)
  104. class WebStainWSI(WebStainImage):
  105. def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
  106. super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)
  107. def _get_tile_url(self, zoom, partition=None, i=None, j=None):
  108. raise NotImplemented("_get_tile_url")
  109. def _generate_tile_urls(self):
  110. raise NotImplemented("generate tile urls")
  111. def find_best_zoom(self):
  112. return 0
  113. def _find_first_tile_width(self):
  114. image_content = fetch_tile_content(self._get_tile_url(self.find_best_zoom(), partition=0, i=0, j=0))
  115. img = Image.open(BytesIO(image_content))
  116. return img.size[0], img.size[1]
  117. def _fetch_all_tiles(self):
  118. batch = []
  119. index = 0
  120. for url in self._generate_tile_urls():
  121. batch.append((url, index))
  122. # DONE
  123. index += 1
  124. # download last batch
  125. if len(batch) != 0:
  126. for content, downloaded_index in download_urls_in_thread(batch):
  127. yield content, downloaded_index
  128. print("Slide download tiles done!!!")
  129. def crawl_image_save_jpeg_and_json(self):
  130. raise NotImplemented("crawl_image_save_jpeg_and_json")
  131. class WebStainWSIOneDIndex(WebStainWSI):
  132. def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
  133. super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)
  134. self.last_partition = None
  135. def _find_last_partition(self):
  136. print("Finding last partition: ", end="")
  137. def func(partition, retry=3):
  138. print(partition, end="")
  139. for i in range(retry):
  140. try:
  141. request = Request(self._get_tile_url(self.find_best_zoom(), partition=partition), method='HEAD')
  142. resp = urlopen(request)
  143. headers = resp.info()
  144. print("<", end=", ")
  145. return True
  146. except Exception as e:
  147. print("e", end="")
  148. time.sleep(2 ** (0.1 * (i + 1)))
  149. print(">", end=", ")
  150. return False
  151. return find_in_log_n(0, 1000 * 1000, func)
  152. def _generate_tile_urls(self):
  153. for partition in range(self.last_partition + 1):
  154. yield self._get_tile_url(self.find_best_zoom(), partition=partition)
  155. def crawl_image_save_jpeg_and_json(self):
  156. def generator():
  157. while True:
  158. if first_temp_rows:
  159. yield first_temp_rows[0]
  160. del first_temp_rows[0]
  161. else:
  162. res = next(content_fetcher, -1)
  163. if res == -1:
  164. break
  165. img = cv2.imdecode(np.frombuffer(res[0], np.uint8), -1)
  166. if len(img.shape) == 2:
  167. img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
  168. yield img
  169. first_image_width, first_image_height = self._find_first_tile_width()
  170. first_temp_rows = []
  171. column_tiles, row_tiles = None, None
  172. self.last_partition = self._find_last_partition()
  173. content_fetcher = self._fetch_all_tiles()
  174. with TiffWriter(self._get_relative_tiff_image_path(), bigtiff=True) as tif:
  175. while column_tiles is None:
  176. content, index = content_fetcher.__next__()
  177. image_array = cv2.imdecode(np.frombuffer(content, np.uint8), cv2.IMREAD_COLOR)
  178. first_temp_rows.append(image_array)
  179. if image_array.shape[1] != first_image_width:
  180. column_tiles = index + 1
  181. row_tiles = (self.last_partition + 1) // column_tiles
  182. shape = (first_image_height * row_tiles, first_image_width * column_tiles, 3)
  183. tif.write(generator(), subfiletype=1, tile=(first_image_height, first_image_width), shape=shape,
  184. dtype=np.uint8,
  185. compression='JPEG', # TODO
  186. photometric='rgb')
  187. """
  188. Save json file
  189. """
  190. self._save_json_file()
  191. class WebStainWSITwoDIndex(WebStainWSI):
  192. def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
  193. super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)
  194. self.last_i = None
  195. self.last_j = None
  196. def _generate_tile_urls(self):
  197. for j in range(self.last_j + 1):
  198. for i in range(self.last_i + 1):
  199. yield self._get_tile_url(self.find_best_zoom(), i=i, j=j)
  200. def _find_last_i_and_j(self):
  201. def func(i, j, retry=3):
  202. print(f"{i}-{j}", end="")
  203. for r in range(retry):
  204. try:
  205. request = Request(self._get_tile_url(self.find_best_zoom(), i=i, j=j), method='HEAD')
  206. resp = urlopen(request)
  207. headers = resp.info()
  208. print("<", end=", ")
  209. return True
  210. except Exception as e:
  211. print("e", end="")
  212. time.sleep(2 ** (0.1 * (r + 1)))
  213. print(">", end=", ")
  214. return False
  215. print("Finding last i: ", end="")
  216. i_func = lambda i: func(i=i, j=0)
  217. last_i = find_in_log_n(0, 1000, i_func)
  218. print("\nFinding last j: ")
  219. j_func = lambda j: func(i=0, j=j)
  220. last_j = find_in_log_n(0, 1000, j_func)
  221. return last_i, last_j
  222. def crawl_image_save_jpeg_and_json(self):
  223. def generator():
  224. while True:
  225. res = next(content_fetcher, -1)
  226. if res == -1:
  227. break
  228. res = cv2.imdecode(np.frombuffer(res[0], np.uint8), -1)
  229. if max(res.shape) >= 260:
  230. raise Exception(f"warning shape: {res.shape}")
  231. res = cv2.resize(res, (min(res.shape[1], 256), min(res.shape[0], 256)))
  232. yield res
  233. first_image_width = 256
  234. first_image_height = 256
  235. self.last_i, self.last_j = self._find_last_i_and_j()
  236. content_fetcher = self._fetch_all_tiles()
  237. with TiffWriter(self._get_relative_tiff_image_path(), bigtiff=True) as tif:
  238. shape = (first_image_height * (self.last_j + 1), first_image_width * (self.last_i + 1), 3)
  239. tif.write(generator(), subfiletype=1,
  240. tile=(first_image_height, first_image_width),
  241. shape=shape,
  242. dtype=np.uint8,
  243. compression='JPEG', # TODO
  244. photometric='rgb')
  245. """
  246. Save json file
  247. """
  248. self._save_json_file()