
database_crawler.py 2.4KB

import time
from urllib.parse import urlparse
from urllib.request import urlopen

from bs4 import BeautifulSoup

from database_crawlers.web_stain_sample import StainType, WebStainWSITwoDIndex


class HeidelbergPathologyImage(WebStainWSITwoDIndex):
    def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
        super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)

    def _get_tile_url(self, zoom, partition=None, i=None, j=None):
        # Deep Zoom tile endpoint: one JPEG per (i, j) grid cell at the given zoom level.
        return f"https://eliph.klinikum.uni-heidelberg.de/dzi/atlas/05-schilddruese/05-{'%.2d' % int(self.image_id)}_files/{zoom}/{i}_{j}.jpeg"

    def get_slide_view_url(self):
        return f"https://eliph.klinikum.uni-heidelberg.de/atlas/?c=05-schilddruese&context=image&pg={self.image_id}"

    def _get_file_path_name(self):
        return self.save_path + self.image_id

    def find_best_zoom(self):
        # Zoom levels run from 16 (full resolution) down to 0; always crawl at 16.
        return 16


class HeidelbergPathologyProvider:
    page_link = "https://eliph.klinikum.uni-heidelberg.de/atlas/?c=05-schilddruese&context=image"
    database_name = "HeidelbergPathology"
    stain_type = StainType.H_AND_E
    is_wsi = True

    @classmethod
    def get_web_stain_samples(cls):
        print(cls.page_link)
        try:
            html_text = urlopen(cls.page_link).read()
            soup = BeautifulSoup(html_text, 'html.parser')
            search_results = soup.find_all("div", {"class": "casegrid"})
            for result_item in search_results:
                image_view_url = result_item.find("a").attrs['href']
                query_param = urlparse(image_view_url).query.split("=")
                if "image&pg" not in query_param:
                    raise Exception("Query params do not contain an image id")
                image_id = query_param[-1]
                image_web_label = str(result_item.find("b").next)
                yield HeidelbergPathologyImage(cls.database_name, image_id, image_web_label, None,
                                               cls.stain_type, cls.is_wsi)
        except Exception as e:
            # On any failure, wait briefly and retry the listing page from the top.
            print(e)
            time.sleep(2)
            yield from cls.get_web_stain_samples()


if __name__ == '__main__':
    bio_atlas_provider = HeidelbergPathologyProvider()
    for slide in bio_atlas_provider.get_web_stain_samples():
        print(slide.image_id, slide.image_web_label, slide.get_slide_view_url())
        slide.crawl_image_save_jpeg_and_json()
        break
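The tiles themselves are served under a Deep Zoom layout (`_files/{zoom}/{i}_{j}.jpeg`), which the base class walks when `crawl_image_save_jpeg_and_json` runs. The snippet below is a minimal sketch of probing that layout directly; the image id, label, zoom level, tile index, and the `database_crawler` import path are assumed example values, not part of the crawler itself.

# Minimal sketch: build and fetch a single Deep Zoom tile URL by hand.
# "1", "thyroid", zoom 16 and tile (0, 0) are assumed example values, and the
# "database_crawler" module path is an assumption about how this file is imported.
from urllib.request import urlopen

from database_crawler import HeidelbergPathologyImage
from database_crawlers.web_stain_sample import StainType

sample = HeidelbergPathologyImage("HeidelbergPathology", "1", "thyroid", None,
                                  StainType.H_AND_E, True)
print(sample.get_slide_view_url())
tile_url = sample._get_tile_url(zoom=16, i=0, j=0)
print(tile_url)

# Optionally fetch one tile to confirm the endpoint responds (needs network access).
tile_bytes = urlopen(tile_url).read()
print(len(tile_bytes), "bytes")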