
database_crawler.py
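A crawler for the Stanford Tissue Microarray Database: it POSTs the thyroid search form at tma.im, parses the result tiles with BeautifulSoup, and for each hit downloads the full-size JPEG from the public jpg.tma.im storage bucket, writing the sample's metadata to a JSON sidecar file next to the image.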
import json
from urllib.parse import urlparse
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup

from database_crawlers.web_stain_sample import WebStainImage, StainType


class StanfordTissueMicroArrayStainSample(WebStainImage):
    """A single stained-tissue image scraped from the Stanford Tissue Microarray Database."""

    def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
        super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)

    def get_slide_view_url(self):
        # Full-size JPEGs are served from a public Google Cloud Storage bucket.
        return f"https://storage.googleapis.com/jpg.tma.im/{self.image_id}"

    def get_file_name(self):
        # Flatten the path-like image id into a single file name and drop its extension.
        image_raw_id = self.image_id.replace("/", "_")
        image_raw_id = ".".join(image_raw_id.split(".")[:-1])
        return self.save_path + image_raw_id

    def get_relative_image_path(self):
        return self.get_file_name() + ".jpeg"

    def get_relative_json_path(self):
        return self.get_file_name() + ".json"

    def crawl_image_save_jpeg(self):
        # Download the image, then write its metadata to a JSON sidecar file.
        urlretrieve(self.get_slide_view_url(), self.get_relative_image_path())
        json_object = json.dumps(self.to_json())
        with open(self.get_relative_json_path(), "w") as outfile:
            outfile.write(json_object)


class StanfordTissueMicroArraySlideProvider:
    page_link = "https://tma.im/cgi-bin/selectImages.pl?organ=thyroid"
    database_name = "StanfordTissueMicroArray"
    stain_type = StainType.UNKNOWN
    is_wsi = False

    @classmethod
    def get_web_stain_samples(cls):
        # The search form is submitted as a POST; the hard-coded session cookie
        # may expire and need refreshing.
        payload = {'250 small images': '250 small images'}
        files = []
        headers = {
            'Cookie': 'DAD_ATTEMPTS=0; DAD_SID=36d77eb69e009b1cf1ebc9c3d7866546; DAD_USERID=WORLD'
        }
        html_text = requests.post(cls.page_link, files=files, headers=headers, data=payload).content.decode("utf-8")
        soup = BeautifulSoup(html_text, 'html.parser')
        # Each result tile is a fixed-size <div class="iDiv0">.
        search_results = soup.find_all("div", {"class": "iDiv0", "style": "width: 86px; height: 260px;"})
        for result_item in search_results:
            image_url = result_item.find("a", {"target": "_blank"}).attrs['href']
            # Drop the leading path component so the id matches the storage bucket layout.
            image_id = "/".join(urlparse(image_url).path.strip("/").split("/")[1:])
            # The second-to-last <p class="iDiv1"> inside the tile holds the label text.
            image_web_label = list(result_item.find_all("p", {"class": "iDiv1"}))[-2].text
            yield StanfordTissueMicroArrayStainSample(cls.database_name, image_id, image_web_label, None,
                                                      cls.stain_type, cls.is_wsi)


if __name__ == '__main__':
    for slide in StanfordTissueMicroArraySlideProvider.get_web_stain_samples():
        print(slide.image_id, slide.image_web_label, slide.get_slide_view_url())
        slide.crawl_image_save_jpeg()
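Because the search URL and database constants live in class attributes, the crawler can be retargeted by subclassing rather than editing the provider. A minimal sketch, assuming the selectImages.pl endpoint accepts other organ values the same way (the organ name "skin" here is a hypothetical example) and that the snippet is appended to this module:

# Sketch only: "skin" is an assumed organ value, not confirmed by the original file.
class StanfordSkinSlideProvider(StanfordTissueMicroArraySlideProvider):
    page_link = "https://tma.im/cgi-bin/selectImages.pl?organ=skin"

for slide in StanfordSkinSlideProvider.get_web_stain_samples():
    print(slide.image_id, slide.image_web_label)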