You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long. 2.6KB

  1. import json
  2. from urllib.parse import urlparse
  3. from urllib.request import urlretrieve
  4. import requests
  5. from bs4 import BeautifulSoup
  6. from database_crawlers.web_stain_sample import WebStainImage, StainType
  class StanfordTissueMicroArrayStainSample(WebStainImage):
      """One Stanford Tissue Microarray image plus the files written for it.

      Extends ``WebStainImage`` with the URL and path helpers needed to
      download the image and store a JSON description next to it.
      ``self.save_path`` and ``self.to_json()`` are not defined in this class;
      presumably they are provided by ``WebStainImage`` -- confirm there.
      """

      def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
          """Pass every argument through unchanged to ``WebStainImage.__init__``."""
          super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)

      def get_slide_view_url(self):
          """Return the string that ``crawl_image_save_jpeg`` downloads from.

          NOTE(review): this returns only ``image_id`` formatted as a string.
          If ``image_id`` is a relative path rather than a full URL, a host
          prefix is needed here -- confirm against the upstream file.
          """
          return f"{self.image_id}"

      def get_file_name(self):
          """Return ``save_path`` followed by the image id without its extension.

          Slashes in the id are replaced by underscores so the id can be used
          as a single file name. Only the text after the last ``.`` is removed.
          An id that contains no ``.`` is kept whole, so extension-less ids do
          not all collapse to the bare ``save_path``.
          """
          image_raw_id = self.image_id.replace("/", "_")
          stem, dot, _extension = image_raw_id.rpartition(".")
          if dot:
              # A dot was found: keep everything before the last one.
              image_raw_id = stem
          return self.save_path + image_raw_id

      def get_relative_image_path(self):
          """Return the target path of the downloaded image (``.jpeg`` suffix)."""
          return self.get_file_name() + ".jpeg"

      def get_relative_json_path(self):
          """Return the target path of the JSON description (``.json`` suffix)."""
          return self.get_file_name() + ".json"

      def crawl_image_save_jpeg(self):
          """Download the image, then write ``self.to_json()`` as a JSON file.

          The image is fetched with ``urlretrieve`` from
          ``get_slide_view_url()`` into ``get_relative_image_path()``. Errors
          raised by the download or by the file write are not caught here.
          """
          urlretrieve(self.get_slide_view_url(), self.get_relative_image_path())
          json_object = json.dumps(self.to_json())
          with open(self.get_relative_json_path(), "w", encoding="utf-8") as outfile:
              outfile.write(json_object)
  class StanfordTissueMicroArraySlideProvider:
      """Scrape a search-results page and yield one stain sample per result."""

      # URL the search form is submitted to. It is empty in this copy of the
      # file -- NOTE(review): restore the real endpoint before running.
      page_link = ""
      # Passed to every yielded sample as its database name.
      database_name = "StanfordTissueMicroArray"
      # Passed to every yielded sample as its stain type.
      stain_type = StainType.UNKNOWN
      # Passed to every yielded sample as its ``is_wsi`` flag.
      is_wsi = False

      @classmethod
      def get_web_stain_samples(cls):
          """Yield a ``StanfordTissueMicroArrayStainSample`` for each search hit.

          Submits the search form, parses the returned HTML and, for every
          ``div.iDiv0`` result tile, builds a sample whose ``image_id`` is the
          path of the tile's link with its first segment removed and whose
          ``image_web_label`` is the text of the tile's second-to-last
          ``p.iDiv1`` element. ``report`` is always ``None``.

          Raises:
              AttributeError: a result tile has no ``<a target="_blank">``.
              IndexError: a result tile has fewer than two ``p.iDiv1`` elements.
          """
          payload = {'250 small images': '250 small images'}
          files = []
          headers = {
              # NOTE(review): hard-coded session cookie; it may expire.
              'Cookie': 'DAD_ATTEMPTS=0; DAD_SID=36d77eb69e009b1cf1ebc9c3d7866546; DAD_USERID=WORLD'
          }
          # NOTE(review): the call target was missing from this statement (it
          # began with "=, files=files"), which is a syntax error. It is
          # reconstructed as a form POST to ``page_link`` because a ``data``
          # payload and ``files`` are sent -- confirm method and URL upstream.
          response = requests.post(cls.page_link, files=files, headers=headers, data=payload)
          html_text = response.content.decode("utf-8")

          soup = BeautifulSoup(html_text, 'html.parser')
          search_results = soup.find_all("div", {"class": "iDiv0", "style": "width: 86px; height: 260px;"})
          for result_item in search_results:
              image_url = result_item.find("a", {"target": "_blank"}).attrs['href']
              # Drop the first path segment and keep the rest as the id.
              path_segments = urlparse(image_url).path.strip("/").split("/")
              image_id = "/".join(path_segments[1:])
              image_web_label = result_item.find_all("p", {"class": "iDiv1"})[-2].text
              yield StanfordTissueMicroArrayStainSample(cls.database_name, image_id, image_web_label, None,
                                                        cls.stain_type, cls.is_wsi)
  if __name__ == '__main__':
      # Script entry point: walk every sample the provider yields, print its
      # id, label and download URL, then download it and write its JSON file.
      samples = StanfordTissueMicroArraySlideProvider.get_web_stain_samples()
      for sample in samples:
          print(sample.image_id, sample.image_web_label, sample.get_slide_view_url())
          sample.crawl_image_save_jpeg()