You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

database_crawler.py 2.8KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. import ssl
  2. import time
  3. from urllib.parse import urlparse
  4. from urllib.request import urlopen
  5. from bs4 import BeautifulSoup
  6. from database_crawlers.web_stain_sample import StainType, WebStainWSIOneDIndex
  7. ssl._create_default_https_context = ssl._create_unverified_context
  8. class BioAtlasAtJakeGittlenLaboratoriesImage(WebStainWSIOneDIndex):
  9. def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
  10. super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)
  11. def _get_tile_url(self, zoom, partition=None, i=None, j=None):
  12. return f"https://bio-atlas.psu.edu/human/tile.jpeg.php?s={self.image_id}&z={zoom}&i={partition}"
  13. def get_slide_view_url(self):
  14. return f"https://bio-atlas.psu.edu/human/view.php?s={self.image_id}"
  15. def _get_file_path_name(self):
  16. return self.save_path + self.image_id
  17. def find_best_zoom(self):
  18. return 0
  19. class BioAtlasThyroidSlideProvider:
  20. page_link = "https://bio-atlas.psu.edu/human/search.php?q=Thyroid&organism%5B%5D=5&age_fr=&age_fr_units=1&age_to=&age_to_units=1&sex%5B%5D=all&thumbnails=on&rpp=30&as_sfid=AAAAAAW0RrspdnblpiFwz8osoAdvS8nafd1J9LG_ARQ-IF_NZ3aI2EXCMDBeqE_iD5rUo1QLg454tS63DMSgATSzgrksb4rMi-GWPl3O9f3JKlqGn8oXoqbOYok3__yZx69ewzg%3D&as_fid=6900aeb3e4cc9f39ef9738a2f11c2cefb8c3f37c#results"
  21. database_name = "BioAtlasThyroidSlideProvider"
  22. stain_type = StainType.H_AND_E
  23. is_wsi = True
  24. @classmethod
  25. def get_web_stain_samples(cls):
  26. print(cls.page_link)
  27. try:
  28. html_text = urlopen(cls.page_link).read()
  29. soup = BeautifulSoup(html_text, 'html.parser')
  30. search_results = soup.find_all("div", {"class": "shadow-box search-result-item search-result-slide"})
  31. for result_item in search_results:
  32. image_view_url = result_item.find("a").attrs['href']
  33. query_param = urlparse(image_view_url).query.split("=")
  34. if query_param[0] != "s": raise Exception("Query params does not contains image url")
  35. image_id = query_param[1]
  36. image_web_label = str(result_item.find("b", text="Diagnosis").next_sibling)
  37. yield BioAtlasAtJakeGittlenLaboratoriesImage(cls.database_name, image_id, image_web_label, None,
  38. cls.stain_type, cls.is_wsi)
  39. except Exception as e:
  40. print(e)
  41. time.sleep(2)
  42. yield cls.get_web_stain_samples()
  43. if __name__ == '__main__':
  44. bio_atlas_provider = BioAtlasThyroidSlideProvider()
  45. for slide in bio_atlas_provider.get_web_stain_samples():
  46. if slide.image_id == "687":
  47. print(slide.image_id, slide.image_web_label, slide.get_slide_view_url())
  48. slide.crawl_image_save_jpeg_and_json()
  49. break