Browse Source

first commit

master
commit
c54381d025
55 changed files with 5261 additions and 0 deletions
  1. 145
    0
      .gitignore
  2. BIN
      DigiCertHighAssuranceEVCA-1.crt
  3. 39
    0
      DigiCertHighAssuranceEVCA-1.pem
  4. 21
    0
      LICENSE
  5. 3
    0
      README.md
  6. 0
    0
      __init__.py
  7. BIN
      classifications_efforts/test_methods/test_fda/amplitude_dir/0.npy
  8. BIN
      classifications_efforts/test_methods/test_fda/amplitude_dir/1.npy
  9. BIN
      classifications_efforts/test_methods/test_fda/amplitude_dir/2.npy
  10. BIN
      classifications_efforts/test_methods/test_fda/amplitude_dir/3.npy
  11. BIN
      classifications_efforts/test_methods/test_fda/amplitude_dir/4.npy
  12. 19
    0
      classifications_efforts/test_methods/test_fda/fda.py
  13. 62
    0
      classifications_efforts/test_methods/test_model_for_bengin_malignant/albumentations_mixup.py
  14. 162
    0
      classifications_efforts/test_methods/test_model_for_bengin_malignant/fragment_splitter.py
  15. 20
    0
      classifications_efforts/test_methods/test_model_for_bengin_malignant/model_train_logger.py
  16. 399
    0
      classifications_efforts/test_methods/test_model_for_bengin_malignant/model_training.py
  17. 3
    0
      classifications_efforts/test_methods/test_model_for_bengin_malignant/run_splitter.sh
  18. 3
    0
      classifications_efforts/test_methods/test_model_for_bengin_malignant/run_test_models.sh
  19. 83
    0
      classifications_efforts/test_methods/test_model_for_bengin_malignant/thyroid_dataset.py
  20. 47
    0
      classifications_efforts/test_methods/test_model_for_bengin_malignant/thyroid_ml_model.py
  21. BIN
      classifications_efforts/test_methods/test_model_for_bengin_malignant/train_state/fda-mixup-std-benign-malignant on stanford-papsociety.rar
  22. BIN
      classifications_efforts/test_methods/test_model_for_bengin_malignant/train_state/min-benign-malignant on stanford-papsociety.rar
  23. 82
    0
      classifications_efforts/test_methods/test_model_for_bengin_malignant/transformation.py
  24. 45
    0
      config.py
  25. 0
    0
      database_crawlers/__init__.py
  26. 0
    0
      database_crawlers/bio_atlas_at_jake_gittlen_laboratories/__init__.py
  27. 64
    0
      database_crawlers/bio_atlas_at_jake_gittlen_laboratories/database_crawler.py
  28. 3
    0
      database_crawlers/bio_atlas_at_jake_gittlen_laboratories/run_bio_atlas_crawler.sh
  29. 61
    0
      database_crawlers/heidelberg_pathology/database_crawler.py
  30. 7
    0
      database_crawlers/image_patcher/bio_atlas_patcher.py
  31. 287
    0
      database_crawlers/image_patcher/image_patcher.py
  32. 60
    0
      database_crawlers/image_patcher/national_cancer_patcher.py
  33. 4
    0
      database_crawlers/image_patcher/run_bio_atlas_patcher.sh
  34. 4
    0
      database_crawlers/image_patcher/run_image_patcher.sh
  35. 4
    0
      database_crawlers/image_patcher/run_national_image_patcher.sh
  36. 280
    0
      database_crawlers/image_patcher/test_image_patcher/nci_svs_and_masks/evaluate_image_patcher_and_visualize.py
  37. 6
    0
      database_crawlers/image_patcher/test_image_patcher/nci_svs_and_masks/run_patch_evaluator_and_visualizer.sh
  38. 4
    0
      database_crawlers/image_patcher/test_image_patcher/run_patch_distribution.sh
  39. 18
    0
      database_crawlers/national_cancer_institute/cells_chart.py
  40. BIN
      database_crawlers/national_cancer_institute/gdc-client
  41. 1147
    0
      database_crawlers/national_cancer_institute/gdc_manifest_20220701_140911.txt
  42. 65
    0
      database_crawlers/national_cancer_institute/patch_distribution.py
  43. 26
    0
      database_crawlers/national_cancer_institute/read_xml_file.py
  44. 4
    0
      database_crawlers/national_cancer_institute/run_cell_distribution.sh
  45. 5
    0
      database_crawlers/national_cancer_institute/run_patch_distribution.sh
  46. 29
    0
      database_crawlers/papsociaty/duplicate_image.txt
  47. 13
    0
      database_crawlers/papsociaty/remove_duplicate_patches.py
  48. 1319
    0
      database_crawlers/rescale console log for papsociety and stanford.txt
  49. 64
    0
      database_crawlers/stanford_tissue_microarray/database_crawler.py
  50. 87
    0
      database_crawlers/utils.py
  51. 296
    0
      database_crawlers/web_stain_sample.py
  52. 20
    0
      datasets_sample_view/convert_to_jpeg.py
  53. 50
    0
      datasets_sample_view/dataset_sample_view.py
  54. 180
    0
      requirements.txt
  55. 21
    0
      utils.py

+ 145
- 0
.gitignore View File

# Custom
*.svs
*.xml
*.csv
*.xcf
*.zip
*.json
*.state
*.tiff
*.tif
.idea
*.jpeg
*.jpg
**/data/
**/patches/
classification_stuff/Transfer-Learning-Library
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

BIN
DigiCertHighAssuranceEVCA-1.crt View File


+ 39
- 0
DigiCertHighAssuranceEVCA-1.pem View File

-----BEGIN CERTIFICATE-----
MIIG5jCCBc6gAwIBAgIQAze5KDR8YKauxa2xIX84YDANBgkqhkiG9w0BAQUFADBs
MQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5jMRkwFwYDVQQLExB3
d3cuZGlnaWNlcnQuY29tMSswKQYDVQQDEyJEaWdpQ2VydCBIaWdoIEFzc3VyYW5j
ZSBFViBSb290IENBMB4XDTA3MTEwOTEyMDAwMFoXDTIxMTExMDAwMDAwMFowaTEL
MAkGA1UEBhMCVVMxFTATBgNVBAoTDERpZ2lDZXJ0IEluYzEZMBcGA1UECxMQd3d3
LmRpZ2ljZXJ0LmNvbTEoMCYGA1UEAxMfRGlnaUNlcnQgSGlnaCBBc3N1cmFuY2Ug
RVYgQ0EtMTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAPOWYth1bhn/
PzR8SU8xfg0ETpmB4rOFVZEwscCvcLssqOcYqj9495BoUoYBiJfiOwZlkKq9ZXbC
7L4QWzd4g2B1Rca9dKq2n6Q6AVAXxDlpufFP74LByvNK28yeUE9NQKM6kOeGZrzw
PnYoTNF1gJ5qNRQ1A57bDIzCKK1Qss72kaPDpQpYSfZ1RGy6+c7pqzoC4E3zrOJ6
4GAiBTyC01Li85xH+DvYskuTVkq/cKs+6WjIHY9YHSpNXic9rQpZL1oRIEDZaARo
LfTAhAsKG3jf7RpY3PtBWm1r8u0c7lwytlzs16YDMqbo3rcoJ1mIgP97rYlY1R4U
pPKwcNSgPqcCAwEAAaOCA4UwggOBMA4GA1UdDwEB/wQEAwIBhjA7BgNVHSUENDAy
BggrBgEFBQcDAQYIKwYBBQUHAwIGCCsGAQUFBwMDBggrBgEFBQcDBAYIKwYBBQUH
AwgwggHEBgNVHSAEggG7MIIBtzCCAbMGCWCGSAGG/WwCATCCAaQwOgYIKwYBBQUH
AgEWLmh0dHA6Ly93d3cuZGlnaWNlcnQuY29tL3NzbC1jcHMtcmVwb3NpdG9yeS5o
dG0wggFkBggrBgEFBQcCAjCCAVYeggFSAEEAbgB5ACAAdQBzAGUAIABvAGYAIAB0
AGgAaQBzACAAQwBlAHIAdABpAGYAaQBjAGEAdABlACAAYwBvAG4AcwB0AGkAdAB1
AHQAZQBzACAAYQBjAGMAZQBwAHQAYQBuAGMAZQAgAG8AZgAgAHQAaABlACAARABp
AGcAaQBDAGUAcgB0ACAARQBWACAAQwBQAFMAIABhAG4AZAAgAHQAaABlACAAUgBl
AGwAeQBpAG4AZwAgAFAAYQByAHQAeQAgAEEAZwByAGUAZQBtAGUAbgB0ACAAdwBo
AGkAYwBoACAAbABpAG0AaQB0ACAAbABpAGEAYgBpAGwAaQB0AHkAIABhAG4AZAAg
AGEAcgBlACAAaQBuAGMAbwByAHAAbwByAGEAdABlAGQAIABoAGUAcgBlAGkAbgAg
AGIAeQAgAHIAZQBmAGUAcgBlAG4AYwBlAC4wEgYDVR0TAQH/BAgwBgEB/wIBADCB
gwYIKwYBBQUHAQEEdzB1MCQGCCsGAQUFBzABhhhodHRwOi8vb2NzcC5kaWdpY2Vy
dC5jb20wTQYIKwYBBQUHMAKGQWh0dHA6Ly93d3cuZGlnaWNlcnQuY29tL0NBQ2Vy
dHMvRGlnaUNlcnRIaWdoQXNzdXJhbmNlRVZSb290Q0EuY3J0MIGPBgNVHR8EgYcw
gYQwQKA+oDyGOmh0dHA6Ly9jcmwzLmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydEhpZ2hB
c3N1cmFuY2VFVlJvb3RDQS5jcmwwQKA+oDyGOmh0dHA6Ly9jcmw0LmRpZ2ljZXJ0
LmNvbS9EaWdpQ2VydEhpZ2hBc3N1cmFuY2VFVlJvb3RDQS5jcmwwHQYDVR0OBBYE
FExYyyXwQU9S9CjIgUObpqig5pLlMB8GA1UdIwQYMBaAFLE+w2kD+L9HAdSYJhoI
Au9jZCvDMA0GCSqGSIb3DQEBBQUAA4IBAQBMeheHKF0XvLIyc7/NLvVYMR3wsXFU
nNabZ5PbLwM+Fm8eA8lThKNWYB54lBuiqG+jpItSkdfdXJW777UWSemlQk808kf/
roF/E1S3IMRwFcuBCoHLdFfcnN8kpCkMGPAc5K4HM+zxST5Vz25PDVR708noFUjU
xbvcNRx3RQdIRYW9135TuMAW2ZXNi419yWBP0aKb49Aw1rRzNubS+QOy46T15bg+
BEkAui6mSnKDcp33C4ypieez12Qf1uNgywPE3IjpnSUBAHHLA7QpYCWP+UbRe3Gu
zVMSW4SOwg/H7ZMZ2cn6j1g0djIvruFQFGHUqFijyDATI+/GJYw2jxyA
-----END CERTIFICATE-----

+ 21
- 0
LICENSE View File

MIT License

Copyright (c) 2022 Amir Hossein

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

+ 3
- 0
README.md View File

# Thyroid-Project-Using-DL

Developing a neural network to classify thyroid carcinoma using H&E slides

+ 0
- 0
__init__.py View File


BIN
classifications_efforts/test_methods/test_fda/amplitude_dir/0.npy View File


BIN
classifications_efforts/test_methods/test_fda/amplitude_dir/1.npy View File


BIN
classifications_efforts/test_methods/test_fda/amplitude_dir/2.npy View File


BIN
classifications_efforts/test_methods/test_fda/amplitude_dir/3.npy View File


BIN
classifications_efforts/test_methods/test_fda/amplitude_dir/4.npy View File


+ 19
- 0
classifications_efforts/test_methods/test_fda/fda.py View File

# from dalib.translation.fourier_transform import FourierTransform
import importlib.util
import sys

from PIL import Image

# Absolute path of the file that defines FourierTransform; it is loaded by path
# instead of through the (commented-out) `dalib` package import.
fourier_transform_address = "E:\\Documentwork\\sharif\\CE Project\\future\\Thyroid Project\\Thyroid-Project-Using-DL\\classification_stuff\\Transfer-Learning-Library\\dalib\\translation\\fourier_transform.py"

# Build the module from its file location, register it in sys.modules under a
# placeholder name, then execute it so its attributes become available.
_placeholder_module_name = "module.name"
spec = importlib.util.spec_from_file_location(_placeholder_module_name, fourier_transform_address)
fourier_module = importlib.util.module_from_spec(spec)
sys.modules[_placeholder_module_name] = fourier_module
spec.loader.exec_module(fourier_module)
FourierTransform = fourier_module.FourierTransform

# NOTE(review): "bio_tile (4).jpeg" is listed twice and there is no "(5)" entry —
# possibly a typo; confirm which target-domain tiles were intended.
image_list = [
    "bio_tile (1).jpeg",
    "bio_tile (2).jpeg",
    "bio_tile (3).jpeg",
    "bio_tile (4).jpeg",
    "bio_tile (4).jpeg",
]
amplitude_dir = "amplitude_dir"
fourier_transform = FourierTransform(image_list, amplitude_dir, beta=0, rebuild=False)

# Image from the source domain; the transform's result is written next to the script.
source_image = Image.open("tile2.jpeg")
source_image_in_target_style = fourier_transform(source_image)
source_image_in_target_style.save("out_fda.jpeg")

+ 62
- 0
classifications_efforts/test_methods/test_model_for_bengin_malignant/albumentations_mixup.py View File

import random

import cv2
import torch
import torch.nn as nn
from albumentations.augmentations.utils import read_rgb_image
from albumentations.core.transforms_interface import BasicTransform, to_tuple


class Mixup(BasicTransform):
    """Albumentations transform that blends an image with a randomly picked mixup sample.

    Each entry of ``mixups`` is indexed as ``entry[0]`` (handed to ``read_fn`` to load
    the mixup image) and ``entry[1]`` (the mixup target). The blending weight ``beta``
    is drawn uniformly from ``beta_limit`` for every call.
    """

    def __init__(self, mixups, read_fn=read_rgb_image, beta_limit=0.3, **kwargs):
        super().__init__(**kwargs)
        self.mixups = mixups
        self.read_fn = read_fn
        # A scalar limit becomes the range (0, beta_limit).
        self.beta_limit = to_tuple(beta_limit, low=0)

    def apply(self, image, mixup_image=None, beta=0.1, **params):
        """Return ``(1 - beta) * image + beta * mixup_image`` cast back to the input dtype."""
        blended = (1 - beta) * image + beta * mixup_image
        return blended.astype(image.dtype)

    def apply_to_target(self, target, beta=0.1, mixup_target=-1, **params):
        """Bundle both targets and the weight so the loss can interpolate between them."""
        return {"img": target, "mixup": mixup_target, "beta": beta}

    def get_params_dependent_on_targets(self, params):
        """Pick a mixup sample and bring its image to the size of the input image."""
        base_image = params["image"]
        chosen = random.choice(self.mixups)
        mixup_image = self.read_fn(chosen[0])

        # Reflect-pad the mixup image when it is smaller than the input image.
        pad_y = max(0, (base_image.shape[0] - mixup_image.shape[0]) // 2)
        pad_x = max(0, (base_image.shape[1] - mixup_image.shape[1]) // 2)
        try:
            mixup_image = cv2.copyMakeBorder(mixup_image, pad_y, pad_y, pad_x, pad_x,
                                             cv2.BORDER_REFLECT)
        except Exception as e:
            # Best effort: a failed padding is reported and the unpadded image is resized.
            print(e)
        # The final resize guarantees the exact (height, width) of the input image.
        target_size = (base_image.shape[1], base_image.shape[0])
        mixup_image = cv2.resize(mixup_image, dsize=target_size)
        return {"mixup_image": mixup_image, "mixup_target": chosen[1]}

    def get_params(self):
        """Draw the blending weight for this call."""
        lower, upper = self.beta_limit[0], self.beta_limit[1]
        return {"beta": random.uniform(lower, upper)}

    @property
    def targets(self):
        return {"image": self.apply, "target": self.apply_to_target}

    @property
    def targets_as_params(self):
        return ["image"]


def mixup_loss(output, target):
    """Cross-entropy loss that also understands the target produced by ``Mixup``.

    Args:
        output: model scores, as accepted by ``nn.CrossEntropyLoss``.
        target: either a tensor of class indices (no mixup applied), or a mapping with
            the keys ``"img"``, ``"mixup"`` and ``"beta"`` as built by
            ``Mixup.apply_to_target``.

    Returns:
        A scalar tensor. For a plain tensor target this is the mean cross-entropy; for a
        mixup target it is the mean of the per-sample losses against both targets,
        weighted by ``1 - beta`` and ``beta``.
    """
    # isinstance (rather than an exact type comparison) keeps Tensor subclasses on the
    # plain path instead of sending them into the mapping branch, where they would fail.
    if isinstance(target, torch.Tensor):
        return nn.CrossEntropyLoss()(output, target)

    # mixup has been used: losses stay per-sample so each sample can use its own beta
    per_sample_loss = nn.CrossEntropyLoss(reduction="none")
    beta = target["beta"]
    weighted = ((1 - beta) * per_sample_loss(output, target["img"])
                + beta * per_sample_loss(output, target["mixup"]))
    return weighted.mean()

+ 162
- 0
classifications_efforts/test_methods/test_model_for_bengin_malignant/fragment_splitter.py View File

import csv
import glob
import os
import random

from tqdm import tqdm

from config import Config


class CustomFragmentLoader:
    """Index patch images per slide from ``patch_labels.csv`` files and split them.

    ``_database_slide_dict`` maps ``database_id -> {image_id: [image_paths, row[3], row[2]]}``
    where ``image_paths`` are the ``*.jpeg`` files of that slide's patch folder.
    Both split methods assign whole slides to a split, so patches of one slide never
    end up in two different splits.
    """

    def __init__(self, datasets_folder_name):
        # Iterated as a collection of folder names under the crawlers directory.
        self._datasets_folder_name = datasets_folder_name
        self._database_slide_dict = {}
        self._load_csv_files_to_dict()

    def _load_csv_files_to_dict(self):
        """Fill ``_database_slide_dict`` from every existing ``<dataset>/patches`` folder."""
        # Relative path: resolved against the current working directory.
        databases_directory = "../../../database_crawlers/"
        list_dir = [os.path.join(databases_directory, o, "patches") for o in self._datasets_folder_name
                    if os.path.isdir(os.path.join(databases_directory, o, "patches"))]
        for db_dir in list_dir:
            csv_dir = os.path.join(db_dir, "patch_labels.csv")
            with open(csv_dir, "r") as csv_file:
                csv_reader = csv.reader(csv_file)
                # Skip the header row; its content is not used.
                header = next(csv_reader, None)
                for row in csv_reader:
                    if row:
                        database_id = row[0]
                        image_id = row[1]
                        # The patch folder is the first directory entry whose name is a prefix of image_id.
                        slide_frag_folder_name = [o for o in os.listdir(db_dir) if image_id.startswith(o)]
                        if slide_frag_folder_name:
                            slide_frag_folder_name = slide_frag_folder_name[0]
                        else:
                            continue
                        slide_path = os.path.join(db_dir, slide_frag_folder_name)
                        image_paths = glob.glob(os.path.join(slide_path, "*.jpeg"))
                        # Slides without any patch image are left out of the index.
                        if image_paths:
                            d = self._database_slide_dict.get(database_id, {})
                            # Note the order: row[3] is stored before row[2].
                            d[image_id] = [image_paths] + [row[3], row[2]]
                            self._database_slide_dict[database_id] = d

    def load_image_path_and_labels_and_split(self, test_percent=20, val_percent=10):
        """Split patches into train/val/test per database and class, keeping slides intact.

        Returns three lists of ``(image_path, class_name)`` tuples. Only classes listed
        in ``Config.class_names`` are kept.
        """
        train_images, val_images, test_images = [], [], []
        for database_name, slides_dict in self._database_slide_dict.items():
            # Per slide: (patch count, patch paths, row[3], row[2]).
            image_paths_by_slide = [(len(v[0]), v[0], v[1], v[2]) for v in slides_dict.values()]
            random.shuffle(image_paths_by_slide)
            # image_paths_by_slide.sort()
            class_slides_dict = {}
            for item in image_paths_by_slide:
                class_name = None
                if database_name == "NationalCancerInstitute":
                    # item[2] is parsed as a quoted 3-tuple string; the three fields are read
                    # as normal / tumor / stromal percentages.
                    normal_percent = int(item[2].strip(r"(|)|\'").split("\', \'")[0])
                    tumor_percent = int(item[2].strip(r"(|)|\'").split("\', \'")[1])
                    stormal_percent = int(item[2].strip(r"(|)|\'").split("\', \'")[2])
                    if stormal_percent == 0:
                        # NOTE(review): indentation was not recoverable for this chain; the
                        # `else` is assumed to belong to the inner if/elif — confirm.
                        if tumor_percent == 100:
                            class_name = "MALIGNANT"
                        elif normal_percent == 100:
                            class_name = "BENIGN"
                        else:
                            class_name = str(tumor_percent)
                elif database_name == "BioAtlasThyroidSlideProvider":
                    if "papillary" in item[3].lower():
                        class_name = "MALIGNANT"
                    elif "normal" in item[3].lower():
                        class_name = "BENIGN"
                # Fall back to the raw row[3] value when no rule above produced a class.
                class_name = class_name if class_name else item[2]
                if class_name in Config.class_names:
                    class_slides_dict[class_name] = class_slides_dict.get(class_name, []) + [
                        (item[0], item[1], class_name)]

            # split test val train because they must not share same slide id fragment

            for thyroid_class, slide_frags in class_slides_dict.items():
                dataset_train_images, dataset_val_images, dataset_test_images = [], [], []
                total_counts = sum([item[0] for item in slide_frags])
                test_counts = total_counts * test_percent // 100
                val_counts = total_counts * val_percent // 100
                train_counts = total_counts - test_counts - val_counts
                # Greedy assignment: each slide goes to train if it still fits the train
                # budget, otherwise to val if it fits there, otherwise to test.
                for i, slide_frags_item in enumerate(slide_frags):
                    if len(dataset_train_images) + slide_frags_item[0] <= train_counts:
                        dataset_train_images += slide_frags_item[1]
                    elif len(dataset_val_images) + slide_frags_item[0] <= val_counts:
                        dataset_val_images += slide_frags_item[1]
                    else:
                        dataset_test_images += slide_frags_item[1]
                train_images += [(i, thyroid_class) for i in dataset_train_images]
                val_images += [(i, thyroid_class) for i in dataset_val_images]
                test_images += [(i, thyroid_class) for i in dataset_test_images]

        return train_images, val_images, test_images

    def national_cancer_image_and_labels_splitter_per_slide(self, test_percent=20, val_percent=10):
        """Like ``load_image_path_and_labels_and_split`` but labels are numeric and carry the slide id.

        Every database is parsed with the percentage-tuple rule. Returns three lists of
        ``(image_path, (class_name, slide_id))`` tuples, where ``slide_id`` is the
        ``image_id`` key of the slide in ``_database_slide_dict``.
        """
        train_images, val_images, test_images = [], [], []
        for database_name, slides_dict in self._database_slide_dict.items():
            print(database_name)
            # Per slide: (patch count, patch paths, row[3], row[2], image_id).
            image_paths_by_slide = [(len(v[0]), v[0], v[1], v[2], k) for k, v in slides_dict.items()]
            random.shuffle(image_paths_by_slide)
            # image_paths_by_slide.sort()
            class_slides_dict = {}
            for item in tqdm(image_paths_by_slide):
                class_name = None
                normal_percent = int(item[2].strip(r"(|)|\'").split("\', \'")[0])
                tumor_percent = int(item[2].strip(r"(|)|\'").split("\', \'")[1])
                stormal_percent = int(item[2].strip(r"(|)|\'").split("\', \'")[2])
                if stormal_percent == 0:
                    # NOTE(review): indentation was not recoverable for this chain; the
                    # `else` is assumed to belong to the inner if/elif — confirm.
                    if tumor_percent == 100:
                        class_name = 100
                    elif normal_percent == 100:
                        class_name = 0
                    else:
                        class_name = tumor_percent
                # `is not None` (not truthiness) because 0 is a valid class here.
                class_name = class_name if class_name is not None else item[2]
                if class_name in Config.class_names:
                    class_slides_dict[class_name] = class_slides_dict.get(class_name, []) + [
                        (item[0], item[1], class_name, item[4])]

            # split test val train because they must not share same slide id fragment

            for thyroid_class, slide_frags in class_slides_dict.items():
                dataset_train_images, dataset_val_images, dataset_test_images = [], [], []
                total_counts = sum([item[0] for item in slide_frags])
                test_counts = total_counts * test_percent // 100
                val_counts = total_counts * val_percent // 100
                train_counts = total_counts - test_counts - val_counts
                for i, slide_frags_item in enumerate(slide_frags):
                    # Pair every patch path with the id of the slide it came from.
                    items_paths = [(item_path, slide_frags_item[3]) for item_path in slide_frags_item[1]]
                    if len(dataset_train_images) + slide_frags_item[0] <= train_counts:
                        dataset_train_images += items_paths
                    elif len(dataset_val_images) + slide_frags_item[0] <= val_counts:
                        dataset_val_images += items_paths
                    else:
                        dataset_test_images += items_paths
                train_images += [(i, (thyroid_class, j)) for i, j in dataset_train_images]
                val_images += [(i, (thyroid_class, j)) for i, j in dataset_val_images]
                test_images += [(i, (thyroid_class, j)) for i, j in dataset_test_images]

        return train_images, val_images, test_images


if __name__ == '__main__':
    # Manual sanity check: split one dataset and report class balance and overlaps.
    # datasets_folder = ["national_cancer_institute"]
    datasets_folder = ["papsociaty"]
    # datasets_folder = ["stanford_tissue_microarray"]
    # datasets_folder = ["bio_atlas_at_jake_gittlen_laboratories"]
    fragment_loader = CustomFragmentLoader(datasets_folder)
    train, val, test = fragment_loader.load_image_path_and_labels_and_split(
        val_percent=Config.val_percent,
        test_percent=Config.test_percent)

    # One "<split>: <total>=<benign>+<malignant>" line per split.
    for split_name, split_items in (("train", train), ("val", val), ("test", test)):
        benign_items = [sample for sample in split_items if sample[1] == "BENIGN"]
        malignant_items = [sample for sample in split_items if sample[1] == "MALIGNANT"]
        print(f"{split_name}: {len(split_items)}={len(benign_items)}+{len(malignant_items)}")

    # Samples shared between two splits; each set is expected to be empty.
    for left, right in ((train, test), (train, val), (test, val)):
        print(set(left) & set(right))
    # NOTE(review): val is intersected with itself, so this prints the number of
    # distinct validation samples — possibly a typo for another pair of splits.
    print(len(set(val) & set(val)))

+ 20
- 0
classifications_efforts/test_methods/test_model_for_bengin_malignant/model_train_logger.py View File

import os


def set_config_for_logger(config_label):
    """Return a DEBUG-level logger that writes to ``./train_state/<config_label>/console.log``.

    The directories are created when missing. The logger is looked up by name, so
    repeated calls with the same label return the same logger object; the file handler
    is attached only once per log file so that records are not written multiple times.

    Args:
        config_label: name of the logger and of the sub-directory under ``./train_state``.

    Returns:
        The configured ``logging.Logger``.
    """
    import logging

    config_train_dir = os.path.join("./train_state", config_label)
    # exist_ok avoids the check-then-create race of a separate isdir()/mkdir() pair.
    os.makedirs(config_train_dir, exist_ok=True)
    log_file = os.path.join(config_train_dir, "console.log")

    logger = logging.getLogger(config_label)
    logger.setLevel(logging.DEBUG)

    # FileHandler stores the absolute path of its file in `baseFilename`.
    log_file_abs = os.path.abspath(log_file)
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == log_file_abs
        for handler in logger.handlers
    )
    if not already_attached:
        fh = logging.FileHandler(log_file)
        formatter = logging.Formatter('%(asctime)s|%(levelname)s|%(message)s', datefmt='%Y-%m-%d %H:%M:%S')
        fh.setFormatter(formatter)
        fh.setLevel(logging.DEBUG)
        logger.addHandler(fh)
    return logger

+ 399
- 0
classifications_efforts/test_methods/test_model_for_bengin_malignant/model_training.py View File

import os
import random
import time
from typing import cast
import numpy as np
import matplotlib.pyplot as plt
import timm
import torch
import torchvision
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm

from config import Config
from fragment_splitter import CustomFragmentLoader
from model_train_logger import set_config_for_logger
from thyroid_dataset import ThyroidDataset
from thyroid_ml_model import ThyroidClassificationModel
from transformation import get_transformation


@torch.no_grad()
def validate(model, data_loader, loss_function=None, show_tqdm=False):
    """Evaluate *model* on *data_loader* and return balanced accuracy, confusion matrix and ROC data.

    Args:
        model: called as ``model(images, validate=True)``; column 1 of its output is used
            as the positive-class score.
        data_loader: loader whose dataset exposes ``class_to_idx_dict``.
        loss_function: optional callable ``(outputs, labels) -> loss``; when given, the
            mean loss over all batches is appended to the result.
        show_tqdm: wrap the loader in a progress bar.

    Returns:
        ``(acc * 100, cf_matrix, (fpr, tpr, auc))`` and, if ``loss_function`` is given,
        a fourth element with the mean batch loss.
    """
    class_indices = sorted(data_loader.dataset.class_to_idx_dict.values())

    batch_losses = []
    predicted_labels = []
    true_labels = []
    positive_scores = []

    batches = tqdm(data_loader) if show_tqdm else data_loader
    for images, labels in batches:
        images = images.to(Config.available_device)
        labels = labels.to(Config.available_device)
        outputs = model(images, validate=True)
        if loss_function:
            batch_losses.append(loss_function(outputs, labels))
        _, batch_predictions = torch.max(outputs, 1)

        positive_scores.extend(outputs[:, 1].cpu())
        predicted_labels.extend(batch_predictions.cpu())
        true_labels.extend(labels.cpu())

    # Rows are true classes, columns are predictions; with normalize="true" every row
    # sums to 1, so the diagonal holds the per-class recall.
    cf_matrix = confusion_matrix(true_labels, predicted_labels, normalize="true")

    # Balanced accuracy: mean of the per-class recalls.
    acc = sum([cf_matrix[c][c] for c in class_indices]) / len(class_indices)

    fpr, tpr, _ = roc_curve(true_labels, positive_scores)
    auc = roc_auc_score(true_labels, positive_scores)
    roc = (fpr, tpr, auc)

    if not loss_function:
        return acc * 100, cf_matrix, roc
    mean_loss = sum(batch_losses) / len(batch_losses)
    return acc * 100, cf_matrix, roc, mean_loss


def get_save_state_dirs(config_label, epoch=None):
    """Create (if needed) and return the directories used to store training state.

    Args:
        config_label: name of the configuration; becomes a sub-directory of ``./train_state``.
        epoch: when not ``None`` (``0`` counts), an ``epoch-<epoch>`` directory is created
            inside the configuration directory.

    Returns:
        ``(trains_state_dir, config_train_dir, save_state_dir)`` where ``save_state_dir``
        is ``None`` when no epoch was given.
    """
    trains_state_dir = "./train_state"
    config_train_dir = os.path.join(trains_state_dir, config_label)
    # makedirs(exist_ok=True) creates missing parents and tolerates a directory that
    # another caller created in the meantime, unlike an isdir() check followed by mkdir().
    os.makedirs(config_train_dir, exist_ok=True)

    save_state_dir = None
    if epoch is not None:
        save_state_dir = os.path.join(config_train_dir, f"epoch-{epoch}")
        os.makedirs(save_state_dir, exist_ok=True)
    return trains_state_dir, config_train_dir, save_state_dir


def plot_and_save_model_per_epoch(epoch,
                                  model_to_save,
                                  val_acc_list,
                                  train_acc_list,
                                  val_loss_list,
                                  train_loss_list,
                                  config_label):
    """Redraw the accuracy and loss history charts and optionally save the model.

    Both charts are written into the configuration directory (overwriting the previous
    versions). When ``model_to_save`` is truthy, it is saved as ``model.state`` inside the
    directory of the given epoch.
    """
    _, config_train_dir, save_state_dir = get_save_state_dirs(config_label, epoch)

    # (file name, validation series, train series, y-axis label)
    charts = (
        ("val_train_acc.jpeg", val_acc_list, train_acc_list, 'Balanced Accuracy'),
        ("val_train_loss.jpeg", val_loss_list, train_loss_list, 'Loss'),
    )
    for file_name, val_series, train_series, y_label in charts:
        fig_save_path = os.path.join(config_train_dir, file_name)
        plt.plot(range(len(val_series)), val_series, label="validation")
        plt.plot(range(len(train_series)), train_series, label="train")
        plt.legend(loc="lower right")
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.savefig(fig_save_path)
        # Clear the shared figure so the next chart starts empty.
        plt.clf()

    if not model_to_save:
        return
    model_to_save.save_model(os.path.join(save_state_dir, "model.state"))


def save_auc_roc_chart_for_test(test_fpr, test_tpr, test_auc_score, config_label, epoch):
    """Plot the test ROC curve and save it in the directory of the given epoch.

    The file name contains the current timestamp, so repeated calls for the same epoch
    do not overwrite each other.
    """
    epoch_dir = get_save_state_dirs(config_label, epoch)[2]
    chart_path = os.path.join(epoch_dir, f"test_roc_{time.time()}.jpeg")
    curve_label = "test, auc=" + str(test_auc_score)

    plt.plot(test_fpr, test_tpr, label=curve_label)
    plt.legend(loc="lower right")
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.savefig(chart_path)
    # Clear the shared figure for whoever plots next.
    plt.clf()


def calculate_test(image_model, epoch, test_data_loader, logger, config_name, show_tqdm=False):
    """Evaluate the model on the test loader, save its ROC chart and log the accuracy.

    The model is switched to eval mode first. The log line contains the accuracy
    rounded to four digits followed by the confusion matrix returned by ``validate``.
    """
    image_model.eval()
    evaluation = validate(image_model, test_data_loader, show_tqdm=show_tqdm)
    accuracy, confusion, (fpr, tpr, auc_score) = evaluation
    accuracy = float(accuracy)

    save_auc_roc_chart_for_test(fpr, tpr, auc_score, config_name, epoch)
    logger.info(f'Test|Epoch:{epoch}|Accuracy:{round(accuracy, 4)}, {confusion}%')


def train_model(base_model, config_base_name, train_val_test_data_loaders, augmentation,
                adaptation_sample_dataset=None,
                train_model_flag=True,
                load_model_from_dir=None):
    """Train (or only evaluate) a ``ThyroidClassificationModel`` built on *base_model*.

    Args:
        base_model: backbone wrapped by ``ThyroidClassificationModel``. For an
            ``Inception3`` backbone the auxiliary output is added to the loss with weight 0.4.
        config_base_name: prefix of the configuration name used for logs and saved state.
        train_val_test_data_loaders: ``(train, val, test)`` data loaders.
        augmentation: passed to ``get_transformation`` and included in the configuration name.
        adaptation_sample_dataset: passed to ``get_transformation`` as ``base_dataset``.
        train_model_flag: when false, the model is only evaluated on the test loader.
        load_model_from_dir: directory containing a ``model.state`` file to start from.

    Raises:
        Any exception raised while training/evaluating is printed, logged and re-raised.
    """
    config_name = f"{config_base_name}-{augmentation}-{','.join(Config.class_idx_dict.keys())}"

    logger = set_config_for_logger(config_name)
    logger.info(f"training config: {config_name}")
    try:
        _is_inception = isinstance(base_model, torchvision.models.inception.Inception3)
        train_data_loader, val_data_loader, test_data_loader = train_val_test_data_loaders
        logger.info(
            f"train valid test splits:" +
            f" {len(train_data_loader.dataset.samples) if train_data_loader else None}," +
            f" {len(val_data_loader.dataset.samples) if val_data_loader else None}," +
            f" {len(test_data_loader.dataset.samples) if test_data_loader else None}")

        # MODEL
        if load_model_from_dir:
            # Load model from file
            model_path = os.path.join(load_model_from_dir, 'model.state')
            image_model = ThyroidClassificationModel(base_model).load_model(model_path).to(Config.available_device)
        else:
            image_model = ThyroidClassificationModel(base_model).to(Config.available_device)

        if train_model_flag:
            # TRAIN
            transformation = get_transformation(augmentation=augmentation, base_dataset=adaptation_sample_dataset)
            train_dataset = cast(ThyroidDataset, train_data_loader.dataset)
            train_dataset.transform = transformation

            cec = nn.CrossEntropyLoss(weight=torch.tensor(train_dataset.class_weights).to(Config.available_device))
            optimizer = optim.Adam(image_model.parameters(), lr=Config.learning_rate)
            my_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=Config.decay_rate)

            val_acc_history = []
            train_acc_history = []
            best_epoch_val_acc = 0
            # Class indices do not change between epochs.
            class_set = sorted(train_data_loader.dataset.class_to_idx_dict.values())

            for epoch in range(Config.n_epoch):
                # Reset per epoch so the logged train accuracy describes this epoch only
                # (previously predictions of all earlier epochs were mixed in).
                train_y_preds = []
                train_y_targets = []

                for images, labels in tqdm(train_data_loader, colour="#0000ff"):
                    # Batches smaller than half the configured batch size are skipped.
                    if len(images) < Config.batch_size // 2:
                        continue
                    image_model.train()
                    images = images.to(Config.available_device)
                    labels = labels.to(Config.available_device)
                    optimizer.zero_grad()
                    pred = image_model(images)
                    # pred label: torch.max(pred, 1)[1], labels
                    if _is_inception:
                        pred, aux_pred = pred
                        loss = cec(pred, labels) + 0.4 * cec(aux_pred, labels)
                    else:
                        loss = cec(pred, labels)
                    loss.backward()
                    optimizer.step()

                    # train preds and labels
                    _, preds = torch.max(pred, 1)
                    train_y_preds.extend(preds.cpu())
                    train_y_targets.extend(labels.cpu())

                # Epoch level
                # validation data
                image_model.eval()

                train_cf_matrix = confusion_matrix(train_y_targets, train_y_preds, normalize="true")
                class_accuracies = [train_cf_matrix[c][c] for c in class_set]
                train_acc = (100 * sum(class_accuracies) / len(class_set)).item()
                train_acc_history.append(train_acc)
                logger.info(f'Train|E:{epoch}|Balanced Accuracy:{round(train_acc, 4)}%,\n{train_cf_matrix}')

                val_acc, val_cf_matrix, _, val_loss = validate(image_model,
                                                               val_data_loader,
                                                               cec)
                val_acc = float(val_acc)
                val_acc_history.append(val_acc)
                logger.info(f'Val|E:{epoch}|Balanced Accuracy:{round(val_acc, 4)}%,\n{val_cf_matrix}')

                # range() stops at n_epoch - 1, so that is the last epoch.
                is_last_epoch = epoch == Config.n_epoch - 1
                is_a_better_epoch = (
                    val_acc >= best_epoch_val_acc
                    and abs(train_acc - val_acc) < Config.train_val_acc_max_distance_for_best_epoch
                )
                if is_a_better_epoch:
                    # Remember the best score; otherwise every epoch would count as "better".
                    best_epoch_val_acc = val_acc
                save_model = is_a_better_epoch or is_last_epoch
                if save_model:
                    calculate_test(image_model, epoch, test_data_loader, logger, config_name, show_tqdm=False)
                plot_and_save_model_per_epoch(epoch if save_model else None,
                                              image_model if save_model else None,
                                              val_acc_history,
                                              train_acc_history,
                                              [],
                                              [],
                                              config_label=config_name)
                my_lr_scheduler.step()
        else:
            # JUST EVALUATE
            calculate_test(image_model, 0, test_data_loader, logger, config_name,
                           show_tqdm=True)
    except Exception as e:
        print(e)
        # logger.exception also records the traceback in the log file.
        logger.exception(str(e))
        raise


def _subsample_split(items, sample_percent):
    """Return ``int(sample_percent * len(items))`` randomly selected items.

    Up to the full size the selection is done without replacement, so no item is
    duplicated or silently dropped when ``sample_percent`` is 1. Asking for more items
    than available can only be satisfied with replacement.
    """
    sample_size = int(sample_percent * len(items))
    if sample_size <= len(items):
        return random.sample(items, k=sample_size)
    return random.choices(items, k=sample_size)


def load_datasets(datasets_folders, test_percent=Config.test_percent, val_percent=Config.val_percent, sample_percent=1,
                  is_nci_per_slide=False):
    """Split the given dataset folders and wrap every split in a dataset and a data loader.

    Args:
        datasets_folders: folder names handed to ``CustomFragmentLoader``.
        test_percent: percentage of patches per class assigned to the test split.
        val_percent: percentage of patches per class assigned to the validation split.
        sample_percent: fraction of every split to keep (1 keeps each sample exactly once).
        is_nci_per_slide: use the per-slide splitter with numeric labels and slide ids.

    Returns:
        ``((train, val, test) sample lists, (train, val, test) datasets,
        (train, val, test) data loaders)``; the loader of an empty split is ``None``.
    """
    fragment_loader = CustomFragmentLoader(datasets_folders)
    if is_nci_per_slide:
        l_train, l_val, l_test = fragment_loader.national_cancer_image_and_labels_splitter_per_slide(
            test_percent=test_percent,
            val_percent=val_percent)
    else:
        l_train, l_val, l_test = fragment_loader.load_image_path_and_labels_and_split(
            test_percent=test_percent,
            val_percent=val_percent)

    l_train = _subsample_split(l_train, sample_percent)
    l_val = _subsample_split(l_val, sample_percent)
    l_test = _subsample_split(l_test, sample_percent)

    l_train_ds = ThyroidDataset(l_train, Config.class_idx_dict)
    l_val_ds = ThyroidDataset(l_val, Config.class_idx_dict)
    l_test_ds = ThyroidDataset(l_test, Config.class_idx_dict)

    # Loaders are only built for non-empty splits.
    l_train_data_loader = None
    if l_train:
        l_train_data_loader = DataLoader(l_train_ds, batch_size=Config.batch_size, shuffle=True)
    l_val_data_loader = None
    if l_val:
        l_val_data_loader = DataLoader(l_val_ds, batch_size=Config.eval_batch_size, shuffle=True)
    l_test_data_loader = None
    if l_test:
        l_test_data_loader = DataLoader(l_test_ds, batch_size=Config.eval_batch_size, shuffle=True)

    return (l_train, l_val, l_test), (l_train_ds, l_val_ds, l_test_ds), (
        l_train_data_loader, l_val_data_loader, l_test_data_loader)


@torch.no_grad()
def evaluate_nci_dataset_per_slide(config_base_name, augmentation, base_model, data_loader,
                                   load_model_from_dir):
    """Evaluate a saved model per slide by averaging its fragment scores.

    For every slide the column-1 outputs of its fragments are averaged and scaled to 0..100.
    Targets and predictions are both rounded to the nearest ten before the confusion matrix
    is computed over ``Config.class_names``.

    :param config_base_name: prefix of the logger/config name.
    :param augmentation: augmentation label, only used in the config name.
    :param base_model: backbone wrapped by ``ThyroidClassificationModel``.
    :param data_loader: loader yielding ``images, (labels, slides)`` batches.
    :param load_model_from_dir: directory containing ``model.state``.
    :return: ``(accuracy in percent, row-normalised confusion matrix)``.
    """

    def _round_to_tens(value):
        return int(round(value / 100, 1) * 100)

    config_name = f"{config_base_name}-{augmentation}-tumor-percent"

    logger = set_config_for_logger(config_name)
    logger.info(f"training config: {config_name}")
    logger.info(
        f"test:" +
        f" {len(data_loader.dataset.samples) if data_loader else None}")

    # MODEL
    # Load model from file
    model_path = os.path.join(load_model_from_dir, 'model.state')
    model = ThyroidClassificationModel(base_model).load_model(model_path).to(Config.available_device)

    slides_preds = {}
    slide_labels = {}
    for images, (labels, slides) in tqdm(data_loader):
        images = images.to(Config.available_device)

        x = model(images, validate=True).cpu()
        preds = x[:, 1]
        logger.info("zero and 1000 percent")
        logger.info(x[:, 0])
        logger.info(x[:, 1])
        for row_index in range(len(labels)):
            slide_id = slides[row_index]
            # NOTE(review): slide ids are used as dict keys; this groups fragments only if the
            # collated ids are hashable by value (e.g. strings) - confirm against the dataset.
            slide_labels[slide_id] = labels[row_index]
            # Append in place; rebuilding the list for every fragment is quadratic per slide.
            slides_preds.setdefault(slide_id, []).append(preds[row_index].item())

    y_targets = []
    y_preds = []
    for slide_id, fragment_scores in slides_preds.items():
        y_preds.append((sum(fragment_scores) / len(fragment_scores)) * 100)
        y_targets.append(int(slide_labels[slide_id]))

    y_targets_rounded = [_round_to_tens(target) for target in y_targets]
    y_preds_rounded = [_round_to_tens(pred) for pred in y_preds]
    cf_matrix = confusion_matrix(y_targets_rounded, y_preds_rounded, labels=Config.class_names, normalize="true")

    # Weighted mean of the diagonal, each class weighted by the sum of its normalised row.
    class_accuracies = [cf_matrix[c][c] for c in range(len(cf_matrix))]
    class_weights = [sum(cf_matrix[c]) for c in range(len(cf_matrix))]
    acc = sum(accuracy * weight for accuracy, weight in zip(class_accuracies, class_weights))
    acc /= sum(class_weights)
    # TN|FN
    # FP|TP
    logger.info(f"target rounded:{y_targets_rounded}")
    logger.info(f"pred rounded:{y_preds_rounded}")
    logger.info(f"Results| acc:{acc * 100}\ncf:{cf_matrix}")
    return acc * 100, cf_matrix


##########
## Runs ##
##########
# train_phase block
if __name__ == '__main__' and Config.train_phase:
    # Training entry point: only runs as a script and when Config.train_phase is enabled.
    # Only the train dataset and the three loaders of the NCI split are kept.
    _, (train_ds, _, _), (train_data_loader, val_data_loader, test_data_loader) = load_datasets(
        ["national_cancer_institute"],
        sample_percent=1)

    # Domain adaptation dataset on small real datasets
    # _, (_, _, domain_sample_test_dataset), _ = load_datasets(["stanford_tissue_microarray",
    #                                                           "papsociaty"],
    #                                                          sample_percent=0.5,
    #                                                          test_percent=100,
    #                                                          val_percent=0)

    # Each entry is (config base name, base model, list of augmentation names to train with).
    for c_base_name, model, augmentations in [
        (f"resnet101_{Config.learning_rate}_{Config.decay_rate}_nci_final",
         torchvision.models.resnet101(pretrained=True, progress=True), [
             "mixup",
             # "jit",
             # "fda",
             # "jit-fda-mixup",
             # "shear",
             # "std"
         ]),
    ]:
        for aug in augmentations:
            # Seeds are reset before every run so each augmentation starts from the same RNG state.
            # NOTE(review): the same `model` instance is passed for every augmentation; if
            # train_model updates it in place, later runs start from already-trained weights
            # once more than one augmentation is enabled - confirm.
            Config.reset_random_seeds()
            train_model(model, c_base_name, (train_data_loader, val_data_loader, test_data_loader),
                        augmentation=aug, adaptation_sample_dataset=train_ds)
# evaluate_phase block
if __name__ == '__main__' and Config.evaluate_phase:
    # Evaluation entry point: only runs as a script and when Config.evaluate_phase is enabled.
    # Main data
    # Classes become the integers 0..100 (identity mapping) for this phase; this overwrites
    # the BENIGN/MALIGNANT defaults on Config for the rest of the process.
    Config.class_names = [i for i in range(101)]
    Config.class_idx_dict = {i: i for i in range(101)}
    # Everything goes into the test split (test_percent=100, val_percent=0), split per slide.
    _, (train_ds, _, _), (_, _, test_data_loader) = load_datasets(
        ["national_cancer_institute",
         ],
        sample_percent=1, test_percent=100, val_percent=0, is_nci_per_slide=True)

    # Each entry is (config base name, base model, list of (augmentation, saved-epoch directory)).
    for c_base_name, model, aug_best_epoch_list in [
        (f"resnet101_{Config.learning_rate}_{Config.decay_rate}_nci_eval",
         torchvision.models.resnet101(pretrained=True, progress=True), [
             ("mixup", "train_state/resnet101_0.0001_1_nci_final-mixup-BENIGN,MALIGNANT/epoch-19/"),
         ]),
        # (f"resnet101_{Config.learning_rate}_{Config.decay_rate}_test_nci_eval",
        #  torchvision.models.resnet101(pretrained=True, progress=True), [
        #      ("fda",
        #       "train_state/runs_0.0001_1_nic_test_benign_mal/resnet101_0.0001_1_nci-fda-BENIGN,MALIGNANT/epoch-3/"),
        #      ("mixup",
        #       "train_state/runs_0.0001_1_nic_test_benign_mal/resnet101_0.0001_1_nci-mixup-BENIGN,MALIGNANT/epoch-3/"),
        #      ("jit",
        #       "train_state/runs_0.0001_1_nic_test_benign_mal/resnet101_0.0001_1_nci-jit-BENIGN,MALIGNANT/epoch-3/"),
        #      ("jit-fda-mixup",
        #       "train_state/runs_0.0001_1_nic_test_benign_mal/resnet101_0.0001_1_nci-jit-fda-mixup-BENIGN,MALIGNANT/epoch-3/"),
        #  ]),

    ]:
        for aug, best_epoch in aug_best_epoch_list:
            # `best_epoch` is the directory the saved model state is loaded from.
            Config.reset_random_seeds()
            evaluate_nci_dataset_per_slide(c_base_name, aug, model, test_data_loader,
                                           load_model_from_dir=best_epoch)

+ 3
- 0
classifications_efforts/test_methods/test_model_for_bengin_malignant/run_splitter.sh View File

# Put the directory three levels up and the current directory on PYTHONPATH, then run the splitter.
export PYTHONPATH="${PYTHONPATH}:../../../:./"
python fragment_splitter.py

+ 3
- 0
classifications_efforts/test_methods/test_model_for_bengin_malignant/run_test_models.sh View File

# Put the directory three levels up and the current directory on PYTHONPATH, then run model training.
export PYTHONPATH="${PYTHONPATH}:../../../:./"
python model_training.py

+ 83
- 0
classifications_efforts/test_methods/test_model_for_bengin_malignant/thyroid_dataset.py View File

import os

import numpy as np
from PIL import Image
from torch.utils.data import Dataset

from config import Config
from fragment_splitter import CustomFragmentLoader
from transformation import get_transformation
from utils import show_and_wait


class ThyroidDataset(Dataset):
    """Image dataset built from ``(image_path, (label, slide))`` entries.

    Every image is converted to RGB, pasted centred on a black square canvas of side
    ``force_to_size_with_padding`` and passed through ``transform`` (or the "min"
    transformation when no transform was given).
    """

    def __init__(self, image_paths_labels_list, class_to_index, transform=None, force_to_size_with_padding=512):
        """
        :param image_paths_labels_list: list of ``(image_path, (label, slide))`` tuples.
        :param class_to_index: mapping from label to class index.
        :param transform: optional callable used as ``transform(image=array)['image']``.
        :param force_to_size_with_padding: side length of the square canvas images are pasted on.
        :raises RuntimeError: if an image path does not exist.
        """
        super().__init__()
        self.class_to_idx_dict = class_to_index
        self.force_to_size_with_padding = force_to_size_with_padding
        self.transform = transform
        # Fallback pipeline, built lazily on first use instead of once per item.
        self._default_transform = None
        self.samples = self._make_dataset(image_paths_labels_list)
        self.class_weights = self._calculate_class_weights(image_paths_labels_list)

    def _calculate_class_weights(self, image_paths_labels_list):
        """Return ``n_samples / (n_present_classes * class_count)`` per present class, ordered by class index."""
        class_counts = {}
        for image_path, (label, slide) in image_paths_labels_list:
            class_counts[label] = class_counts.get(label, 0) + 1

        total = len(image_paths_labels_list)
        class_weights = [
            (self.class_to_idx_dict.get(label, None), total / (len(class_counts) * count))
            for label, count in class_counts.items()]
        # Sorting the (index, weight) pairs orders the weights by class index.
        class_weights.sort()
        return [weight for _, weight in class_weights]

    def _make_dataset(self, image_paths_labels_list):
        """Check that every path exists and map labels to indices; returns ``(path, (index, slide))`` items."""
        images = []
        for image_path, (label, slide) in image_paths_labels_list:
            if not os.path.exists(os.path.abspath(image_path)):
                raise (RuntimeError(f"{image_path} not found."))
            # Labels missing from the mapping are stored as the string "Unknown label".
            item = (image_path, (self.class_to_idx_dict.get(label, "Unknown label"), slide))
            images.append(item)
        return images

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        """Return ``(transformed image, (class index, slide))`` for sample ``index``."""
        path, target = self.samples[index]
        # Close the file handle as soon as the pixels have been converted.
        with Image.open(path) as raw_image:
            image = raw_image.convert('RGB')
        image = np.array(self.add_margin(image))

        transform = self.transform
        if transform is None:
            if getattr(self, "_default_transform", None) is None:
                self._default_transform = get_transformation(augmentation="min")
            transform = self._default_transform
        image = transform(image=image)['image']

        return image, target

    def add_margin(self, pil_img):
        """Paste ``pil_img`` centred on a black square canvas of side ``force_to_size_with_padding``."""
        width, height = pil_img.size
        new_width = self.force_to_size_with_padding
        new_height = self.force_to_size_with_padding
        result = Image.new("RGB", (new_width, new_height), (0, 0, 0))
        top_padding = (new_height - height) // 2
        left_padding = (new_width - width) // 2
        result.paste(pil_img, (left_padding, top_padding))
        return result


if __name__ == '__main__':
    # Smoke test: build the three datasets from the small real datasets and print the first train item.
    label_to_index = Config.class_idx_dict
    source_folders = ["stanford_tissue_microarray", "papsociaty"]
    train_split, val_split, test_split = CustomFragmentLoader(
        source_folders).load_image_path_and_labels_and_split()
    train_ds = ThyroidDataset(train_split, label_to_index)
    test_ds = ThyroidDataset(test_split, label_to_index)
    val_ds = ThyroidDataset(val_split, label_to_index)
    first_item = train_ds[0]
    print(first_item)

+ 47
- 0
classifications_efforts/test_methods/test_model_for_bengin_malignant/thyroid_ml_model.py View File

import torch
import torchvision
from torch import nn


class ThyroidClassificationModel(nn.Module):
    """Two-class head on top of a backbone that outputs 1000 features.

    For an Inception3 backbone a second, identical head (``classifier2``) is created and
    applied to the second backbone output when ``forward`` is called with ``validate=False``.
    """

    def __init__(self, base_model):
        """
        :param base_model: backbone module whose output has 1000 features.
        """
        super().__init__()
        self.base_model = base_model
        self.classifier = self._build_classifier_head()
        self._is_inception3 = isinstance(base_model, torchvision.models.inception.Inception3)
        if self._is_inception3:
            self.classifier2 = self._build_classifier_head()

    @staticmethod
    def _build_classifier_head():
        """Return the 1000 -> 500 -> 100 -> 2 head, ending in batch norm and softmax."""
        return nn.Sequential(
            nn.Linear(1000, 500),
            nn.BatchNorm1d(500),
            nn.ReLU(),
            nn.Linear(500, 100),
            nn.BatchNorm1d(100),
            nn.ReLU(),
            nn.Linear(100, 2),
            nn.BatchNorm1d(2),
            nn.Softmax(dim=-1)
        )

    def forward(self, x, validate=False):
        """Return class probabilities.

        For an Inception3 backbone with ``validate=False`` a pair is returned: ``classifier``
        applied to the first backbone output and ``classifier2`` applied to the second.
        """
        output = self.base_model(x.float())
        if self._is_inception3 and not validate:
            return self.classifier(output[0]), self.classifier2(output[1])
        return self.classifier(output)

    def save_model(self, path):
        """Write this module's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load_model(self, path, map_location="cpu"):
        """Load a ``state_dict`` from ``path``, switch to eval mode and return ``self``.

        The checkpoint is mapped to ``map_location`` first so that a state saved on a GPU
        also loads on a CPU-only host; ``load_state_dict`` then copies the values into this
        module's existing parameters.
        """
        self.load_state_dict(torch.load(path, map_location=map_location))
        self.eval()
        return self

BIN
classifications_efforts/test_methods/test_model_for_bengin_malignant/train_state/fda-mixup-std-benign-malignant on stanford-papsociety.rar View File


BIN
classifications_efforts/test_methods/test_model_for_bengin_malignant/train_state/min-benign-malignant on stanford-papsociety.rar View File


+ 82
- 0
classifications_efforts/test_methods/test_model_for_bengin_malignant/transformation.py View File

import albumentations as A
from albumentations.pytorch import ToTensorV2

from albumentations_mixup import Mixup


def get_transformation(augmentation, crop_size=299, base_dataset=None):
    """Build the albumentations pipeline registered under the name ``augmentation``.

    :param augmentation: one of "min", "std", "jit", "jit-nrs", "fda", "mixup",
        "jit-fda-mixup", "jit-fda-mixup-nrs" or "shear".
    :param crop_size: side length of the final random crop.
    :param base_dataset: object with a ``samples`` list; read by the FDA and Mixup variants.
    :raises ValueError: for an unknown augmentation name.
    """
    padded_size = int(crop_size * 1.25)

    def crop_stages():
        # Pad up to 1.25x the crop size, centre-crop to that size, then take a random crop.
        return [
            A.PadIfNeeded(min_height=padded_size, min_width=padded_size, always_apply=True),
            A.CenterCrop(padded_size, padded_size),
            A.RandomCrop(crop_size, crop_size, always_apply=True),
        ]

    def augmented_pipeline(extra_transforms, random_scale=True):
        # flip/rotate/scale -> crop -> variant-specific transforms -> blur/noise -> tensor
        scale_probability = 0.5 if random_scale else 0
        stages = [
            A.Flip(p=0.25),
            A.Rotate(p=0.25),
            A.RandomScale(scale_limit=0.5, p=scale_probability),
        ]
        stages += crop_stages()
        stages += extra_transforms
        stages += [
            A.Blur(p=0.25, blur_limit=2),
            A.GaussNoise(p=0.25, var_limit=10),
            ToTensorV2(),
        ]
        return A.Compose(stages)

    def fda_transform(probability):
        reference_paths = [sample[0] for sample in base_dataset.samples]
        return A.domain_adaptation.FDA(reference_paths, beta_limit=0.1, p=probability)

    def mixup_transform(probability):
        mixups = [sample[0:2] for sample in base_dataset.samples]
        return Mixup(mixups=mixups, p=probability, beta_limit=(0.1))

    if augmentation == "min":
        return A.Compose(crop_stages() + [ToTensorV2()])

    if augmentation == "std":
        return augmented_pipeline([])

    if augmentation in ("jit", "jit-nrs"):
        # The "-nrs" suffix switches the random-scale step off.
        return augmented_pipeline([A.ColorJitter(p=0.5, hue=.5)],
                                  random_scale=augmentation == "jit")

    if augmentation == "fda":
        return augmented_pipeline([fda_transform(0.5)])

    if augmentation == "mixup":
        return augmented_pipeline([mixup_transform(0.5)])

    if augmentation in ("jit-fda-mixup", "jit-fda-mixup-nrs"):
        p = 0.16
        combined = [
            fda_transform(p),
            mixup_transform(p),
            A.ColorJitter(p=p, hue=.5),
        ]
        return augmented_pipeline(combined, random_scale=augmentation == "jit-fda-mixup")

    if augmentation == "shear":
        return augmented_pipeline([
            A.Affine(shear={"x": (-10, 10), "y": (-10, 10)}, p=0.5)
        ], random_scale=False)

    raise ValueError(f"Augmentation unknown: {augmentation}")

+ 45
- 0
config.py View File

import random

import torch


class Config:
    """Project-wide settings, stored as class attributes and read as ``Config.<name>``."""

    # DEBUG shortens training, forces the CPU and uses a single worker (see the attributes below).
    DEBUG = False

    batch_size = 32
    eval_batch_size = 128

    test_percent = 20
    val_percent = 10

    learning_rate = 0.0001
    decay_rate = 1  # 0.99**50=0.6, 0.99**100=0.36
    n_epoch = 2 if DEBUG else 20

    # "cuda" only when CUDA is available and DEBUG is off.
    available_device = "cuda" if torch.cuda.is_available() and not DEBUG else "cpu"
    # Runs once, when the class body is executed (i.e. at import time).
    print(f"Device: {available_device}")

    workers = 1 if DEBUG else 40

    # learned from evaluate_image_patcher_and_visualize.py
    laplacian_threshold = 298

    # RANDOM SEED
    seed = 115

    @staticmethod
    def reset_random_seeds():
        """Re-seed Python's ``random`` module and torch with ``Config.seed``."""
        # NOTE(review): numpy's global RNG is not seeded here - confirm whether any
        # augmentation relies on it.
        random.seed(Config.seed)
        torch.manual_seed(Config.seed)

    class_names = ["BENIGN", "MALIGNANT"]
    class_idx_dict = {"BENIGN": 0, "MALIGNANT": 1}

    train_val_acc_max_distance_for_best_epoch = 6  # Percent
    n_epoch_for_image_patcher = 60

    # Switches read by the training script's ``__main__`` blocks.
    train_phase = False
    evaluate_phase = False


# Seed the generators as soon as the configuration module is imported.
Config.reset_random_seeds()

+ 0
- 0
database_crawlers/__init__.py View File


+ 0
- 0
database_crawlers/bio_atlas_at_jake_gittlen_laboratories/__init__.py View File


+ 64
- 0
database_crawlers/bio_atlas_at_jake_gittlen_laboratories/database_crawler.py View File

import ssl
import time
from urllib.parse import urlparse
from urllib.request import urlopen

from bs4 import BeautifulSoup

from database_crawlers.web_stain_sample import StainType, WebStainWSIOneDIndex

ssl._create_default_https_context = ssl._create_unverified_context


class BioAtlasAtJakeGittlenLaboratoriesImage(WebStainWSIOneDIndex):
    """Slide served by bio-atlas.psu.edu, addressed by ``image_id``."""

    def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
        # Pure pass-through to the base-class constructor.
        super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)

    def _get_tile_url(self, zoom, partition=None, i=None, j=None):
        # Only ``zoom`` and ``partition`` end up in the URL; ``i`` and ``j`` are ignored.
        return f"https://bio-atlas.psu.edu/human/tile.jpeg.php?s={self.image_id}&z={zoom}&i={partition}"

    def get_slide_view_url(self):
        """Return the viewer page URL of this slide."""
        return f"https://bio-atlas.psu.edu/human/view.php?s={self.image_id}"

    def _get_file_path_name(self):
        # ``save_path`` is not set in this class; presumably provided by the base class - confirm.
        return self.save_path + self.image_id

    def find_best_zoom(self):
        # Always zoom level 0 for this source.
        return 0


class BioAtlasThyroidSlideProvider:
    """Enumerates the thyroid slides listed on the Bio-Atlas search page."""

    page_link = "https://bio-atlas.psu.edu/human/search.php?q=Thyroid&organism%5B%5D=5&age_fr=&age_fr_units=1&age_to=&age_to_units=1&sex%5B%5D=all&thumbnails=on&rpp=30&as_sfid=AAAAAAW0RrspdnblpiFwz8osoAdvS8nafd1J9LG_ARQ-IF_NZ3aI2EXCMDBeqE_iD5rUo1QLg454tS63DMSgATSzgrksb4rMi-GWPl3O9f3JKlqGn8oXoqbOYok3__yZx69ewzg%3D&as_fid=6900aeb3e4cc9f39ef9738a2f11c2cefb8c3f37c#results"
    database_name = "BioAtlasThyroidSlideProvider"
    stain_type = StainType.H_AND_E
    is_wsi = True

    @classmethod
    def _fetch_image_ids_and_labels(cls):
        """Download and parse the search page; return a list of ``(image_id, image_web_label)``."""
        html_text = urlopen(cls.page_link).read()
        soup = BeautifulSoup(html_text, 'html.parser')
        search_results = soup.find_all("div", {"class": "shadow-box search-result-item search-result-slide"})
        parsed = []
        for result_item in search_results:
            image_view_url = result_item.find("a").attrs['href']
            query_param = urlparse(image_view_url).query.split("=")
            if query_param[0] != "s":
                raise Exception("Query params does not contains image url")
            image_id = query_param[1]
            image_web_label = str(result_item.find("b", text="Diagnosis").next_sibling)
            parsed.append((image_id, image_web_label))
        return parsed

    @classmethod
    def get_web_stain_samples(cls):
        """Yield one ``BioAtlasAtJakeGittlenLaboratoriesImage`` per search result.

        Fetching and parsing is retried every two seconds until it succeeds. The page is
        parsed completely before the first slide is yielded, so a retry can never emit
        duplicates (the previous version yielded a nested generator object on failure).
        """
        while True:
            print(cls.page_link)
            try:
                parsed_results = cls._fetch_image_ids_and_labels()
                break
            except Exception as e:
                print(e)
                time.sleep(2)

        for image_id, image_web_label in parsed_results:
            yield BioAtlasAtJakeGittlenLaboratoriesImage(cls.database_name, image_id, image_web_label, None,
                                                         cls.stain_type, cls.is_wsi)


if __name__ == '__main__':
    # Crawl only the slide with id "687", then stop.
    wanted_image_id = "687"
    provider = BioAtlasThyroidSlideProvider()
    for web_slide in provider.get_web_stain_samples():
        if web_slide.image_id != wanted_image_id:
            continue
        print(web_slide.image_id, web_slide.image_web_label, web_slide.get_slide_view_url())
        web_slide.crawl_image_save_jpeg_and_json()
        break

+ 3
- 0
database_crawlers/bio_atlas_at_jake_gittlen_laboratories/run_bio_atlas_crawler.sh View File

# Put the directory two levels up and the current directory on PYTHONPATH, then run the crawler.
export PYTHONPATH="${PYTHONPATH}:../../:./"
python database_crawler.py

+ 61
- 0
database_crawlers/heidelberg_pathology/database_crawler.py View File

import time
from urllib.parse import urlparse
from urllib.request import urlopen

from bs4 import BeautifulSoup

from database_crawlers.web_stain_sample import StainType, WebStainWSITwoDIndex


class HeidelbergPathologyImage(WebStainWSITwoDIndex):
    """Slide of the Heidelberg pathology atlas (chapter ``05-schilddruese``), addressed by ``image_id``."""

    def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
        # Pure pass-through to the base-class constructor.
        super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)

    def _get_tile_url(self, zoom, partition=None, i=None, j=None):
        # The image id is zero-padded to two digits; ``partition`` is ignored, tiles are addressed by i and j.
        return f"https://eliph.klinikum.uni-heidelberg.de/dzi/atlas/05-schilddruese/05-{'%.2d' % int(self.image_id)}_files/{zoom}/{i}_{j}.jpeg"

    def get_slide_view_url(self):
        """Return the atlas viewer page URL of this slide."""
        return f"https://eliph.klinikum.uni-heidelberg.de/atlas/?c=05-schilddruese&context=image&pg={self.image_id}"

    def _get_file_path_name(self):
        # ``save_path`` is not set in this class; presumably provided by the base class - confirm.
        return self.save_path + self.image_id

    def find_best_zoom(self):
        # 16 -> 0
        return 16


class HeidelbergPathologyProvider:
    """Enumerates the thyroid slides listed on the Heidelberg pathology atlas page."""

    page_link = "https://eliph.klinikum.uni-heidelberg.de/atlas/?c=05-schilddruese&context=image"
    database_name = "HeidelbergPathology"
    stain_type = StainType.H_AND_E
    is_wsi = True

    @classmethod
    def _fetch_image_ids_and_labels(cls):
        """Download and parse the atlas page; return a list of ``(image_id, image_web_label)``."""
        html_text = urlopen(cls.page_link).read()
        soup = BeautifulSoup(html_text, 'html.parser')
        search_results = soup.find_all("div", {"class": "casegrid"})
        parsed = []
        for result_item in search_results:
            image_view_url = result_item.find("a").attrs['href']
            # Splitting the whole query on "=" leaves "image&pg" as its own piece,
            # directly followed by the page number used as the image id.
            query_param = urlparse(image_view_url).query.split("=")
            if "image&pg" not in query_param:
                raise Exception("Query params does not contains image id")
            image_id = query_param[-1]
            image_web_label = str(result_item.find("b").next)
            parsed.append((image_id, image_web_label))
        return parsed

    @classmethod
    def get_web_stain_samples(cls):
        """Yield one ``HeidelbergPathologyImage`` per case listed on the page.

        Fetching and parsing is retried every two seconds until it succeeds. The page is
        parsed completely before the first slide is yielded, so a retry can never emit
        duplicates (the previous version yielded a nested generator object on failure).
        """
        while True:
            print(cls.page_link)
            try:
                parsed_results = cls._fetch_image_ids_and_labels()
                break
            except Exception as e:
                print(e)
                time.sleep(2)

        for image_id, image_web_label in parsed_results:
            yield HeidelbergPathologyImage(cls.database_name, image_id, image_web_label, None,
                                           cls.stain_type, cls.is_wsi)


if __name__ == '__main__':
    # Crawl only the first slide the provider returns (if there is one).
    heidelberg_provider = HeidelbergPathologyProvider()
    for first_slide in heidelberg_provider.get_web_stain_samples():
        slide_summary = (first_slide.image_id, first_slide.image_web_label, first_slide.get_slide_view_url())
        print(*slide_summary)
        first_slide.crawl_image_save_jpeg_and_json()
        break

+ 7
- 0
database_crawlers/image_patcher/bio_atlas_patcher.py View File

from image_patcher import ImageAndSlidePatcher

if __name__ == '__main__':
    # Patch only the Bio-Atlas dataset folder that lives one directory up.
    patcher = ImageAndSlidePatcher()
    patcher.save_patches_in_folders("../", "bio_atlas_at_jake_gittlen_laboratories")

+ 287
- 0
database_crawlers/image_patcher/image_patcher.py View File

import csv
import json
import os
import os.path as os_path
import random
import re
from math import ceil
from os import listdir
from os.path import isfile, join

import cv2
import tifffile
import zarr as ZarrObject
from tqdm import tqdm

from config import Config
from database_crawlers.web_stain_sample import ThyroidCancerLevel, WebStainImage
from utils import show_and_wait


class ThyroidFragmentFilters:
    """Fragment filters based on the variance of the Laplacian of a fragment."""

    @staticmethod
    def func_laplacian_threshold(threshold=Config.laplacian_threshold):
        """Return a one-argument filter bound to ``threshold``."""

        def laplacian_filter(image_nd_array):
            return ThyroidFragmentFilters._empty_frag_with_laplacian_threshold(image_nd_array, threshold)

        return laplacian_filter

    @staticmethod
    def _empty_frag_with_laplacian_threshold(image_nd_array, threshold=Config.laplacian_threshold,
                                             return_variance=False):
        """Tell whether the Laplacian variance of the fragment reaches ``threshold``.

        The fragment is converted to grayscale and blurred with a 3x3 Gaussian before the
        Laplacian is taken. Returns the boolean, or ``(boolean, variance)`` when
        ``return_variance`` is set.
        """
        blurred_gray = cv2.GaussianBlur(cv2.cvtColor(image_nd_array, cv2.COLOR_BGR2GRAY), (3, 3), 0)
        laplacian = cv2.Laplacian(blurred_gray, cv2.CV_64F, ksize=3, )

        # meanStdDev returns (mean, stddev); the variance is the squared standard deviation.
        std_dev = cv2.meanStdDev(laplacian)[1][0][0]
        variance = std_dev ** 2
        reaches_threshold = variance >= threshold
        if not return_variance:
            return reaches_threshold
        return reaches_threshold, variance


class ImageAndSlidePatcher:
    """Cuts slide images into fixed-size fragments, filters them and stores the kept ones on disk."""

    @classmethod
    def _check_magnification_from_description(cls, tiff_address):
        """Return the ``AppMag`` value found in the TIFF image description, or ``None`` if unavailable."""
        try:
            # The context manager closes the file handle once the description has been read.
            with tifffile.TiffFile(tiff_address) as tif_file_obj:
                image_description = tif_file_obj.pages.keyframe.tags["ImageDescription"].value
            app_mag = int(re.findall("(AppMag = [0-9]+)", image_description)[0].split(" = ")[-1])
            return app_mag
        except Exception:
            return None

    @classmethod
    def _zarr_loader(cls, tiff_address, key=0):
        """Open page ``key`` of a TIFF as a read-only zarr object."""
        image_zarr = tifffile.imread(tiff_address, aszarr=True, key=key, )
        zarr = ZarrObject.open(image_zarr, mode='r')
        return zarr

    @classmethod
    def _jpeg_loader(cls, jpeg_address):
        """Read an image with ``cv2.imread``."""
        im = cv2.imread(jpeg_address)
        return im

    @classmethod
    def _json_key_loader(cls, json_file_address, key=None):
        """Load a JSON file and return the whole document, or only ``key`` when given."""
        with open(json_file_address, 'rb') as file:
            json_dict = json.loads(file.read())
        if key:
            return json_dict[key]
        return json_dict

    @classmethod
    def _get_extension_from_path(cls, file_path):
        """Return the last extension of ``file_path`` including the dot ("" when there is none)."""
        return os_path.splitext(file_path)[-1]

    @classmethod
    def _get_file_name_from_path(cls, file_path):
        """Return the base name without its last extension.

        Files without any extension keep their full name (they used to map to "").
        """
        return os_path.splitext(os_path.split(file_path)[-1])[0]

    @staticmethod
    def _stable_slide_id(file_name):
        """Return a decimal id derived from ``file_name`` that is identical in every process.

        The built-in ``hash`` of a ``str`` is salted per interpreter run, so it cannot be used
        to recognise slides that were patched by an earlier run.
        """
        import hashlib

        digest = hashlib.sha256(file_name.encode("utf-8")).digest()
        return str(int.from_bytes(digest[:8], "big"))

    @staticmethod
    def _axis_start_positions(axis_length, frag_size, frag_overlap):
        """Return the nominal fragment start offsets along one axis."""
        step_size = int(frag_size * (1 - frag_overlap))
        overlap_size = frag_size - step_size
        return list(range(0, ceil((axis_length - overlap_size) / step_size) * step_size, step_size))

    @classmethod
    def _get_number_of_initial_frags(cls, zarr_object, frag_size=512, frag_overlap=0.1):
        """Return how many fragments the generator produces for ``zarr_object`` before filtering."""
        zarr_shape = zarr_object.shape
        w_range = cls._axis_start_positions(zarr_shape[0], frag_size, frag_overlap)
        h_range = cls._axis_start_positions(zarr_shape[1], frag_size, frag_overlap)
        return len(w_range) * len(h_range)

    @classmethod
    def _generate_raw_fragments_from_image_array_or_zarr(cls, image_object, frag_size=512, frag_overlap=0.1,
                                                         shuffle=True):
        """Yield ``(fragment, (start_w, start_h))`` for every grid position of ``image_object``.

        Fragments at the far border are shifted back so they stay inside the image. When an
        axis is shorter than ``frag_size`` the fragment covers the whole axis. Yields nothing
        when ``image_object`` is ``None``.
        """
        if image_object is None:
            return
        zarr_shape = image_object.shape

        def frag_picker(w_pos, h_pos):
            end_w, end_h = min(zarr_shape[0], w_pos + frag_size), min(zarr_shape[1], h_pos + frag_size)
            # Clamp at 0: a negative start would be read as an offset from the end of the axis
            # and silently truncate the fragment.
            start_w, start_h = max(0, end_w - frag_size), max(0, end_h - frag_size)
            return image_object[start_w:end_w, start_h: end_h], (start_w, start_h)

        w_range = cls._axis_start_positions(zarr_shape[0], frag_size, frag_overlap)
        h_range = cls._axis_start_positions(zarr_shape[1], frag_size, frag_overlap)

        pos_list = [(w, h) for w in w_range for h in h_range]
        if shuffle:
            random.shuffle(pos_list)
        for w, h in pos_list:
            yield frag_picker(w, h)

    @classmethod
    def _filter_frag_from_generator(cls, frag_generator, filter_func_list, return_all_with_condition=False,
                                    all_frag_count=None, output_file=None):
        """Apply every filter to each fragment.

        Yields ``(fragment, position)`` for fragments passing all filters, or
        ``(fragment, position, passed)`` for every fragment when ``return_all_with_condition``
        is set. Progress is reported through tqdm into ``output_file``.
        """
        for next_test_item, frag_pos in tqdm(frag_generator, total=all_frag_count, file=output_file,
                                             postfix="Filtering", position=0):
            condition = all(function(next_test_item) for function in filter_func_list)
            if return_all_with_condition:
                yield next_test_item, frag_pos, condition
            elif condition:
                yield next_test_item, frag_pos

    @classmethod
    def _get_json_and_image_address_of_directory(cls, directory_path, ignore_json=False):
        """Pair the JSON and image files of ``directory_path`` that share a base name.

        Returns ``(json_path, image_path)`` tuples for complete pairs, or only the image
        paths when ``ignore_json`` is set.
        """
        # NOTE(review): ".tif" and ".svs" are handled by save_image_patches_and_update_csv but
        # are not listed here - confirm whether that is intended.
        image_formats = [".jpeg", ".tiff", ".jpg"]
        json_format = ".json"
        files = sorted(f for f in listdir(directory_path) if isfile(join(directory_path, f)))
        pairs = {}
        for file_name_with_ext in files:
            file_path = join(directory_path, file_name_with_ext)
            pair = pairs.setdefault(cls._get_file_name_from_path(file_path), [None, None])
            extension = cls._get_extension_from_path(file_path)
            if extension in image_formats:
                pair[1] = file_path
            elif extension == json_format:
                pair[0] = file_path
        if ignore_json:
            return [image_path for _, image_path in pairs.values() if image_path is not None]
        return [(json_path, image_path) for json_path, image_path in pairs.values()
                if json_path is not None and image_path is not None]

    @staticmethod
    def create_patch_dir_and_initialize_csv(database_path):
        """Ensure ``<database_path>/patches`` exists and open its ``patch_labels.csv`` for appending.

        The header row is written when the file is empty. The caller owns the returned file
        object and has to close it.

        :return: ``(data_dir, patch_dir, csv_writer, csv_file)``.
        """
        data_dir = os.path.join(database_path, "data")
        patch_dir = os.path.join(database_path, "patches")
        if not os.path.isdir(patch_dir):
            os.mkdir(patch_dir)
        label_csv_path = os.path.join(patch_dir, "patch_labels.csv")
        # newline="" is what the csv module expects from files it writes to.
        csv_file = open(label_csv_path, "a+", newline="")
        csv_writer = csv.writer(csv_file)
        csv_file.seek(0)
        if len(csv_file.read(100)) <= 0:
            csv_writer.writerow(WebStainImage.sorted_json_keys())
        return data_dir, patch_dir, csv_writer, csv_file

    @classmethod
    def save_image_patches_and_update_csv(cls, thyroid_type, thyroid_desired_classes, csv_writer, web_details,
                                          image_path, slide_patch_dir, slide_id):
        """Write the slide's CSV row and save every fragment that passes the Laplacian filter.

        ``thyroid_type`` and ``thyroid_desired_classes`` are accepted for interface
        compatibility and are not used.

        :return: ``(number of saved fragments, number of candidate fragments)``, or ``None``
            when the image was ignored by the user (no CSV row is written in that case).
        """
        if cls._get_extension_from_path(image_path) in [".tiff", ".tif", ".svs"]:
            image_object = cls._zarr_loader(image_path)
        else:
            image_object = cls.ask_image_scale_and_rescale(cls._jpeg_loader(image_path))
        if image_object is None:
            # Calling the fragment generator never returns None (it is a generator function),
            # so an ignored image has to be detected here, before ``.shape`` is accessed.
            return None

        csv_writer.writerow(list(web_details.values()))
        generator = cls._generate_raw_fragments_from_image_array_or_zarr(image_object)
        total_counts = cls._get_number_of_initial_frags(zarr_object=image_object)

        os.makedirs(slide_patch_dir, exist_ok=True)
        filters = [ThyroidFragmentFilters.func_laplacian_threshold(Config.laplacian_threshold)]
        fragment_id = 0
        slide_progress_file_path = os.path.join(slide_patch_dir, "progress.txt")
        with open(slide_progress_file_path, "w") as file:
            for fragment, frag_pos in cls._filter_frag_from_generator(generator, filters, all_frag_count=total_counts,
                                                                      output_file=file):
                fragment_file_path = os.path.join(slide_patch_dir, f"{slide_id}-{fragment_id}.jpeg")
                cv2.imwrite(fragment_file_path, fragment)
                fragment_id += 1
        return fragment_id, total_counts

    @classmethod
    def save_patches_in_folders(cls, database_directory, dataset_dir=None):
        """Patch every not-yet-patched slide of the selected dataset folders.

        :param database_directory: directory containing the dataset folders.
        :param dataset_dir: a folder name, a list of folder names, or ``None`` for every folder
            of ``database_directory``. Folders without a ``data`` sub-directory are skipped.
        """
        thyroid_desired_classes = [ThyroidCancerLevel.MALIGNANT, ThyroidCancerLevel.BENIGN]
        if dataset_dir is None:
            datasets_dirs = os.listdir(database_directory)
        elif isinstance(dataset_dir, str):
            datasets_dirs = [dataset_dir]
        else:
            datasets_dirs = list(dataset_dir)
        list_dir = [os.path.join(database_directory, o) for o in datasets_dirs
                    if os.path.isdir(os.path.join(database_directory, o, "data"))]
        for database_path in list_dir:
            print("database path: ", database_path)
            data_dir, patch_dir, csv_writer, csv_file = cls.create_patch_dir_and_initialize_csv(database_path)
            try:
                for json_path, image_path in cls._get_json_and_image_address_of_directory(data_dir):
                    print("image path: ", image_path)
                    file_name = cls._get_file_name_from_path(image_path)
                    slide_id = cls._stable_slide_id(file_name)
                    slide_patch_dir = os.path.join(patch_dir, slide_id)
                    if os.path.isdir(slide_patch_dir):
                        # it has already been patched
                        continue

                    web_details = cls._json_key_loader(json_path)
                    web_details["image_id"] = slide_id
                    web_label = web_details["image_web_label"]
                    thyroid_type = ThyroidCancerLevel.get_thyroid_level_from_diagnosis_label(web_label)
                    web_details["image_class_label"] = thyroid_type.value[1]

                    cls.save_image_patches_and_update_csv(thyroid_type, thyroid_desired_classes, csv_writer,
                                                          web_details, image_path, slide_patch_dir, slide_id)
            finally:
                # One CSV handle per dataset; close it even when patching fails.
                csv_file.close()

    @classmethod
    def save_papsociaty_patch(cls, database_path):
        """Patch the images stored under ``<database_path>/data/<class name>/``.

        The web label of an image is ``"<class folder>-<file name>"``.
        """
        thyroid_desired_classes = [ThyroidCancerLevel.MALIGNANT, ThyroidCancerLevel.BENIGN]
        print("database path: ", database_path)
        # The CSV is opened once for all class folders instead of once per folder.
        data_dir, patch_dir, csv_writer, csv_file = cls.create_patch_dir_and_initialize_csv(database_path)
        try:
            for folder in Config.class_names:
                group_path = os.path.join(database_path, "data", folder)
                for image_path in cls._get_json_and_image_address_of_directory(group_path, ignore_json=True):
                    print("image path: ", image_path)
                    file_name = cls._get_file_name_from_path(image_path)
                    slide_id = cls._stable_slide_id(file_name)
                    slide_patch_dir = os.path.join(patch_dir, slide_id)
                    if os.path.isdir(slide_patch_dir):
                        # it has already been patched
                        continue
                    web_label = folder + "-" + file_name
                    thyroid_type = ThyroidCancerLevel.get_thyroid_level_from_diagnosis_label(web_label)
                    web_details = {"database_name": "PapSociety",
                                   "image_id": slide_id,
                                   "image_web_label": web_label,
                                   "image_class_label": thyroid_type.value[1],
                                   "report": None,
                                   "stain_type": "UNKNOWN",
                                   "is_wsi": False}
                    cls.save_image_patches_and_update_csv(thyroid_type, thyroid_desired_classes, csv_writer,
                                                          web_details, image_path, slide_patch_dir, slide_id)
        finally:
            csv_file.close()

    @classmethod
    def ask_image_scale_and_rescale(cls, image):
        """Show ``image``, ask the user for a scale divisor and return the image resized by ``1 / answer``.

        Returns ``None`` when the user answers ``i`` (ignore). Any other invalid answer, or an
        answer that makes the resize fail (e.g. ``0``), asks again.
        """
        # small: S, Medium: M, Large:L
        while True:
            show_and_wait(image)
            res = input("how much plus pointer fill a cell(float, i:ignore, else repeat): ")
            if res == "i":
                return None
            if not re.fullmatch(r"[0-9]+(\.[0-9]*)?", res):
                continue
            try:
                scale = 1 / float(res)
                return cv2.resize(image, (0, 0), fx=scale, fy=scale)
            except Exception as e:
                print(e)


if __name__ == '__main__':
    # Fixed seed for the `random` module when run as a script.
    random.seed(1)

    # Both patching calls below are disabled; running this script currently only sets the seed.
    database_directory = "./"
    # ImageAndSlidePatcher.save_patches_in_folders(database_directory, dataset_dir=["stanford_tissue_microarray"])
    # ImageAndSlidePatcher.save_papsociaty_patch(os.path.join(database_directory, "papsociaty"))

+ 60
- 0
database_crawlers/image_patcher/national_cancer_patcher.py View File

import concurrent.futures
import os
import pathlib

from tqdm import tqdm

from config import Config
from image_patcher import ImageAndSlidePatcher
from national_cancer_institute.read_xml_file import get_slide_info_from_bcr_xml


def save_national_cancer_institute_patch(database_path):
    """Patch every ``.svs`` slide found under ``<database_path>/data`` using a thread pool.

    Slide labels come from the BCR XML files found in the same tree; slides without a label
    and slides whose patch directory already exists are skipped.

    :param database_path: dataset root containing a ``data`` directory.
    """

    def patch_image(image_path):
        # Best effort per slide: an error is printed and the slide is skipped (returns None).
        try:
            image_path = str(image_path)
            print()
            print("image path: ", image_path)
            file_name = ImageAndSlidePatcher._get_file_name_from_path(image_path)
            slide_id = file_name.split(".")[0]
            slide_patch_dir = os.path.join(patch_dir, slide_id)
            if os.path.isdir(slide_patch_dir):
                print("it has already been patched")
                return
            web_label = slide_infos.get(slide_id, None)
            if web_label is None:
                print("Ignored")
                return
            web_details = {"database_name": "NationalCancerInstitute",
                           "image_id": slide_id,
                           "image_web_label": web_label,
                           "image_class_label": web_label,
                           "report": None,
                           "stain_type": "H&E",
                           "is_wsi": True}
            return ImageAndSlidePatcher.save_image_patches_and_update_csv(web_label, None, csv_writer, web_details,
                                                                          image_path, slide_patch_dir, slide_id)
        except Exception as e:
            print(e)

    data_dir = os.path.join(database_path, "data")
    slide_infos = {}
    for xml_path in pathlib.Path(data_dir).glob("**/*.xml"):
        slide_infos.update(get_slide_info_from_bcr_xml(str(xml_path)))

    data_dir, patch_dir, csv_writer, csv_file = ImageAndSlidePatcher.create_patch_dir_and_initialize_csv(database_path)
    try:
        csv_file.flush()

        # NOTE(review): all worker threads write through the same csv_writer - confirm that
        # interleaved rows cannot occur.
        with concurrent.futures.ThreadPoolExecutor(max_workers=Config.workers) as executor:
            image_paths = list(pathlib.Path(data_dir).glob("**/*.svs"))
            print()
            for res in tqdm(executor.map(patch_image, image_paths), total=len(image_paths)):
                if res:
                    # Persist the rows written so far after every successfully patched slide.
                    csv_file.flush()
    finally:
        # Closing also flushes; the handle used to stay open after the function returned.
        csv_file.close()


if __name__ == '__main__':
    # The National Cancer Institute dataset lives one directory up.
    nci_database_path = os.path.join("../", "national_cancer_institute")
    save_national_cancer_institute_patch(nci_database_path)

+ 4
- 0
database_crawlers/image_patcher/run_bio_atlas_patcher.sh View File

# Put the two parent directories and the current directory on PYTHONPATH, then run the Bio-Atlas patcher.
export PYTHONPATH="${PYTHONPATH}:../../:../:./"
python bio_atlas_patcher.py

+ 4
- 0
database_crawlers/image_patcher/run_image_patcher.sh View File

# Put the two parent directories and the current directory on PYTHONPATH, then run the image patcher.
export PYTHONPATH="${PYTHONPATH}:../../:../:./"
python image_patcher.py

+ 4
- 0
database_crawlers/image_patcher/run_national_image_patcher.sh View File

# Put the two parent directories and the current directory on PYTHONPATH, then run the NCI patcher.
export PYTHONPATH="${PYTHONPATH}:../../:../:./"
python national_cancer_patcher.py

+ 280
- 0
database_crawlers/image_patcher/test_image_patcher/nci_svs_and_masks/evaluate_image_patcher_and_visualize.py View File

import concurrent.futures
import math

import cv2
import matplotlib.pyplot as plt
import numpy as np

from config import Config
from database_crawlers.image_patcher.image_patcher import ImageAndSlidePatcher, ThyroidFragmentFilters
from utils import check_if_generator_is_empty


def imul(a, b):
    """Multiply ``a`` by ``b`` and round the product up to the next integer."""
    product = a * b
    return math.ceil(product)


def calculate_acc_and_sensitivity(image_path, zarr_loader_mask, zarr_loader, frag_generator, scaled_masked_image,
                                  generated_mask_scale, laplacian_threshold, slide_patch_size,
                                  save_generated_image=True):
    """Compare the Laplacian-threshold fragment filter against a reference mask.

    Fragments drawn from ``frag_generator`` are passed through a Laplacian
    threshold filter; for each fragment the filter's verdict is compared with
    the matching region of ``zarr_loader_mask`` and counted in a confusion
    table. Each fragment is also pasted, down-scaled, into
    ``scaled_masked_image`` (mutated in place), with rejected fragments darkened.

    :param image_path: path of the slide; only used to derive the output name
        ``<image_path without extension>_generated_mask.jpg``.
    :param zarr_loader_mask: array-like reference mask, sliced with two indices
        and read through its first channel.
    :param zarr_loader: array-like slide; its ``shape[0]`` is compared with the
        mask's ``shape[0]`` to derive the mask scale.
    :param frag_generator: generator of raw fragments, consumed (at most
        ``slide_patch_size`` items) by this call.
    :param scaled_masked_image: canvas the down-scaled fragments are written to.
    :param generated_mask_scale: scale factor applied to fragments and their
        positions before writing to the canvas.
    :param laplacian_threshold: threshold handed to
        ``ThyroidFragmentFilters.func_laplacian_threshold``.
    :param slide_patch_size: maximum number of fragments to process; when falsy
        the total fragment count of the slide is used for ``all_frag_count``.
    :param save_generated_image: when true, write the canvas to disk.
    :return: dict with integer counts under the keys "TP", "FP", "TN", "FN".
    """
    def process_frag(args):
        # Each item is unpacked as (fragment array, fragment position, filter verdict).
        # presumably ``condition`` is True when the fragment passed the Laplacian filter - TODO confirm
        next_test_item, frag_pos, condition = args
        frag_shape = next_test_item.shape
        mask_scaled_frag_shape = list((imul(frag_shape[i], mask_scale) for i in range(2)))

        # Map the fragment's position and extent into mask coordinates.
        mask_frag_pos = list((imul(frag_pos[i], mask_scale) for i in range(2)))
        mask_w1, mask_w2 = mask_frag_pos[0], mask_frag_pos[0] + mask_scaled_frag_shape[0]
        mask_h1, mask_h2 = mask_frag_pos[1], mask_frag_pos[1] + mask_scaled_frag_shape[1]
        mask_item = zarr_loader_mask[mask_w1:mask_w2, mask_h1:mask_h2]
        # Scale the mask crop back by the inverse of mask_scale.
        mask_item = cv2.resize(mask_item, dsize=(0, 0), fx=1 / mask_scale, fy=1 / mask_scale)

        fragment_size = next_test_item.shape
        # NOTE(review): cv2.resize takes dsize as (width, height) while this tuple is built
        # from shape[0], shape[1]; harmless for square fragments - confirm for non-square ones.
        scaled_frag_size = (imul(fragment_size[0], generated_mask_scale), imul(fragment_size[1], generated_mask_scale))
        scaled_frag = cv2.resize(next_test_item[:, :, :3], dsize=scaled_frag_size, interpolation=cv2.INTER_CUBIC)
        scaled_frag_size = scaled_frag.shape

        # NOTE(review): next_test_item was already dereferenced above, so the None
        # branch below cannot be reached with a None fragment.
        if next_test_item is not None:
            mask_item = mask_item[:, :, 0]
            # The reference mask counts as "set" when the first channel's mean exceeds 30% of 256.
            masked = mask_item.mean() > 256 * .3
            # NOTE(review): these counters are incremented from worker threads without a lock.
            if condition and masked:
                background_dict["TP"] += 1
            elif condition and not masked:
                background_dict["FP"] += 1
            elif not condition and masked:
                background_dict["FN"] += 1
                # show_and_wait(next_test_item)
                # show_and_wait(mask_item)
            elif not condition and not masked:
                background_dict["TN"] += 1
        else:
            return None
        if not condition:
            # background patches get dark
            scaled_frag = (scaled_frag * 0.3).astype(np.int8)
        scaled_pos = list((imul(frag_pos[i], generated_mask_scale) for i in range(2)))
        try:
            mask_g_w1, mask_g_w2 = scaled_pos[0], scaled_pos[0] + scaled_frag_size[0]
            mask_g_h1, mask_g_h2 = scaled_pos[1], scaled_pos[1] + scaled_frag_size[1]
            scaled_masked_image[mask_g_w1:mask_g_w2, mask_g_h1:mask_g_h2] = scaled_frag
        except Exception as e:
            # Best effort: a fragment that does not fit the canvas is reported and skipped.
            print(e)
        return True

    # Ratio between the mask's and the slide's first dimension.
    mask_scale = zarr_loader_mask.shape[0] / zarr_loader.shape[0]

    filter_func_list = [ThyroidFragmentFilters.func_laplacian_threshold(laplacian_threshold)]
    background_dict = {"TP": 0, "FP": 0, "TN": 0, "FN": 0}
    total_frags = slide_patch_size if slide_patch_size else ImageAndSlidePatcher._get_number_of_initial_frags(
        zarr_loader)
    frag_filtered = ImageAndSlidePatcher._filter_frag_from_generator(frag_generator, filter_func_list,
                                                                     return_all_with_condition=True,
                                                                     all_frag_count=total_frags)
    with concurrent.futures.ThreadPoolExecutor(max_workers=Config.workers) as executor:
        futures = []
        patch_count = 0
        for args in frag_filtered:
            patch_count += 1
            future_res = executor.submit(process_frag, args)
            futures.append(future_res)
            # Drain the submitted work whenever a full batch of workers is queued
            # or the fragment budget is reached; result() re-raises worker errors.
            if len(futures) >= Config.workers or patch_count == slide_patch_size:
                for future in concurrent.futures.as_completed(futures):
                    future.result()
                futures = []
            if patch_count == slide_patch_size:
                break

    if save_generated_image:
        # Output name: original path with its last extension replaced.
        masked_image_path = ".".join(image_path.split(".")[:-1]) + "_generated_mask.jpg"
        cv2.imwrite(masked_image_path, scaled_masked_image)

    return background_dict


def score_calculator(accuracy, specificity, acc_w=0.75):
    """Return the weighted sum of two metrics.

    ``accuracy`` is weighted by ``acc_w`` and ``specificity`` by
    ``1 - acc_w``.
    """
    weighted_accuracy = accuracy * acc_w
    weighted_specificity = specificity * (1 - acc_w)
    return weighted_accuracy + weighted_specificity


def get_zarr_loaders_and_generators():
    """Open every (mask, slide) pair of the module-level ``image_lists``.

    :return: one list per pair, laid out as
        ``[mask loader, slide loader, shuffled fragment generator,
        zero-filled preview canvas, preview scale]``.
    """
    # Scale factor used for the generated preview image of every slide.
    preview_scale = 10 / 512

    def open_pair(mask_path, slide_path):
        mask_loader = ImageAndSlidePatcher._zarr_loader(mask_path)
        slide_loader = ImageAndSlidePatcher._zarr_loader(slide_path)
        fragments = ImageAndSlidePatcher._generate_raw_fragments_from_image_array_or_zarr(slide_loader,
                                                                                          shuffle=True)
        slide_shape = slide_loader.shape
        # The canvas gets a 5 pixel margin on both axes and three channels.
        canvas_shape = (
            imul(slide_shape[0], preview_scale) + 5,
            imul(slide_shape[1], preview_scale) + 5,
            3,
        )
        return [mask_loader, slide_loader, fragments, np.zeros(canvas_shape), preview_scale]

    return [open_pair(mask_path, slide_path) for mask_path, slide_path in image_lists]


def update_and_find_best_threshold(initial_thresh, learn_threshold_and_log_cf_matrix_per_patch=True):
    """Tune (or just evaluate) the Laplacian threshold on the slides in ``image_lists``.

    In learning mode the slides are processed in batches of up to 2000
    fragments each; after every batch a score is computed from the batch's
    confusion table and the threshold is moved by the current jump size. When
    the score does not improve over the previous batch, the direction is
    reversed and the jump size is multiplied by the decay constant.

    In evaluation mode (``learn_threshold_and_log_cf_matrix_per_patch=False``)
    a single epoch runs, the threshold is left untouched, confusion tables are
    accumulated and printed, and the generated preview images are saved.

    :param initial_thresh: starting Laplacian threshold.
    :param learn_threshold_and_log_cf_matrix_per_patch: select learning mode
        (True) or evaluation mode (False).
    :return: the final Laplacian threshold.
    """
    initial_threshold_jump_size_const = 120
    threshold_jump_size = initial_threshold_jump_size_const
    decay_const = 0.85
    # NOTE(review): decay_count is incremented below but never read in this function.
    decay_count = 0

    # Direction of the next threshold move: +1 or -1.
    threshold_jump_increase = 1

    threshold_score = None
    # update after initial run
    laplacian_threshold = initial_thresh

    threshold_history = []
    score_history = []
    for epoch in range((Config.n_epoch_for_image_patcher if learn_threshold_and_log_cf_matrix_per_patch else 1)):
        print("New Epoch")
        # Fresh loaders and fragment generators for every epoch.
        zarr_loaders_and_generators = get_zarr_loaders_and_generators()
        whole_background_dict_per_slide = [{} for i in range(len(zarr_loaders_and_generators))]
        whole_background_dict = {}

        # Loop while at least one slide still has fragments (exhausted slides are set to None).
        while sum([item is not None for item in zarr_loaders_and_generators]) >= 1:
            none_empty_generators = [i for i in range(len(zarr_loaders_and_generators)) if
                                     zarr_loaders_and_generators[i] is not None]

            if learn_threshold_and_log_cf_matrix_per_patch:
                # In learning mode the confusion table is per batch, so reset it.
                whole_background_dict = {}
            # Learning mode only continues while at least 6 slides still have fragments.
            if len(none_empty_generators) >= 6 or not learn_threshold_and_log_cf_matrix_per_patch:
                for slide_pick in none_empty_generators:
                    img_path = image_lists[slide_pick][1]
                    zarr_loader_mask = zarr_loaders_and_generators[slide_pick][0]
                    zarr_loader = zarr_loaders_and_generators[slide_pick][1]
                    frag_generator = zarr_loaders_and_generators[slide_pick][2]

                    generated_scaled_mask_image = zarr_loaders_and_generators[slide_pick][3]
                    generated_mask_scale = zarr_loaders_and_generators[slide_pick][4]

                    group_dict = calculate_acc_and_sensitivity(img_path,
                                                               zarr_loader_mask,
                                                               zarr_loader,
                                                               frag_generator,
                                                               generated_scaled_mask_image,
                                                               generated_mask_scale,
                                                               laplacian_threshold,
                                                               slide_patch_size=2000,
                                                               save_generated_image=not learn_threshold_and_log_cf_matrix_per_patch)
                    # presumably check_if_generator_is_empty returns a usable generator, or a
                    # falsy value once the generator is exhausted - TODO confirm against utils
                    for i in range(len(zarr_loaders_and_generators)):
                        if zarr_loaders_and_generators[i]:
                            generator = check_if_generator_is_empty(zarr_loaders_and_generators[i][2])
                            if generator:
                                zarr_loaders_and_generators[i][2] = generator
                            else:
                                zarr_loaders_and_generators[i] = None

                    # Accumulate this slide's counts into the batch and per-slide tables.
                    for key, value in group_dict.items():
                        whole_background_dict[key] = whole_background_dict.get(key, 0) + value
                        whole_background_dict_per_slide[slide_pick][key] = whole_background_dict_per_slide[
                                                                               slide_pick].get(key, 0) + value

                if learn_threshold_and_log_cf_matrix_per_patch:
                    # Small epsilon avoids division by zero on empty tables.
                    e = .000001
                    total_preds = (sum(list(whole_background_dict.values())) + e)
                    acc = (whole_background_dict["TP"] + whole_background_dict["TN"]) / total_preds
                    positive_preds = (whole_background_dict["TP"] + whole_background_dict["FP"] + e)
                    precision = whole_background_dict["TP"] / positive_preds
                    # Precision is passed in score_calculator's second ("specificity") slot.
                    next_score = score_calculator(acc, precision)
                    if threshold_score is None:
                        # The first scored batch only establishes the baseline.
                        threshold_score = next_score
                    else:
                        threshold_history.append(laplacian_threshold)
                        score_history.append(next_score)
                        # Both branches store next_score, so each batch is compared with
                        # the previous batch rather than with the best score seen so far.
                        if next_score > threshold_score:
                            threshold_score = next_score

                            # Improved: keep moving in the same direction.
                            laplacian_threshold += threshold_jump_increase * threshold_jump_size
                        elif next_score <= threshold_score:
                            threshold_score = next_score

                            # Not improved: reverse direction and shrink the step.
                            threshold_jump_increase *= -1
                            threshold_jump_size *= decay_const

                            laplacian_threshold += threshold_jump_increase * threshold_jump_size
                            decay_count += 1
                        save_threshold_and_score_chart(threshold_history, score_history)

                    acc = round(acc, 3)
                    precision = round(precision, 3)
                    threshold_score_rounded = round(threshold_score, 3)
                    print(
                        f"acc:{acc},precision:{precision},score:{threshold_score_rounded},table:{whole_background_dict}" +
                        f"thresh:{laplacian_threshold},jump_size:{threshold_jump_size}")
                else:
                    print(f"table:{whole_background_dict},table_per_slide:{whole_background_dict_per_slide}" +
                          f"threshold:{laplacian_threshold},jump_size:{threshold_jump_size}")
            else:
                break
    return laplacian_threshold


def save_threshold_and_score_chart(threshold_history, score_history):
    """Save line charts of the threshold and score histories as JPEG files.

    Two files are written to the working directory:
    ``laplacian_threshold_history_chart.jpeg`` and
    ``laplacian_threshold_score_history_chart.jpeg``. The x axis is the
    position of each value in its history list.

    :param threshold_history: sequence of Laplacian thresholds, one per batch.
    :param score_history: sequence of objective scores, one per batch.
    """
    charts = (
        ("laplacian_threshold_history_chart.jpeg", threshold_history, 'Laplacian threshold'),
        # The label previously read "Sore"; fixed to "Score".
        ("laplacian_threshold_score_history_chart.jpeg", score_history, 'Objective function - Score'),
    )
    for fig_save_path, history, y_label in charts:
        plt.plot(range(len(history)), history)
        plt.xlabel('Batch')
        plt.ylabel(y_label)
        plt.savefig(fig_save_path)
        # Clear the shared figure so the next chart starts empty.
        plt.clf()


if __name__ == '__main__':
    # Each slide "<stem>.svs" has a hand-made reference mask "<stem>_masked.tiff"
    # next to it. The trailing comment on every stem is the label tuple that
    # accompanied the slide in the original list.
    slide_stems = [
        "TCGA-BJ-A3F0-01A-01-TSA.728CE583-95BE-462B-AFDF-FC0B228DF3DE__3",  # "('0', '100', '0')"
        "TCGA-DJ-A1QG-01A-01-TSA.04c62c21-dd45-49ea-a74f-53822defe097__2000",  # "('0', '100', '0')"
        # disabled: "TCGA-EL-A3ZQ-01A-01-TS1.344610D2-AB50-41C6-916E-FF0F08940BF1__2000"  "('0', '100', '0')"
        "TCGA-ET-A39N-01A-01-TSA.C38FCE19-9558-4035-9F0B-AD05B9BE321D___198",  # "('45', '55', '0')"
        # disabled: "TCGA-J8-A42S-01A-01-TSA.7B80CBEB-7B85-417E-AA0C-11C79DE40250__0"  "('0', '40', '60')"
        "TCGA-ET-A39O-01A-01-TSA.3829C900-7597-4EA9-AFC7-AA238221CE69_7000",  # "('0', '90', '10')"
        "TCGA-EL-A4K7-11A-01-TS1.C08B59AA-87DF-4ABB-8B70-25FEF9893C7F__70",  # "('100', '0', '0')"
        "TCGA-EL-A3TB-11A-01-TS1.6E0966C9-1552-4B30-9008-8ACF737CA8C3__2000",  # "('100', '0', '0')"
    ]
    # (mask path, slide path) pairs read by the functions of this module.
    image_lists = [("./" + stem + "_masked.tiff", "./" + stem + ".svs") for stem in slide_stems]

    # First learn the threshold, then run one evaluation pass with it.
    learned_threshold = update_and_find_best_threshold(500, learn_threshold_and_log_cf_matrix_per_patch=True)
    update_and_find_best_threshold(learned_threshold, learn_threshold_and_log_cf_matrix_per_patch=False)

# Start with 500 with jump size 120 and decay 0.85
# table:{'TP': 15018, 'FP': 412, 'TN': 66898, 'FN': 2389},
# table_per_slide:[
# {'TP': 460, 'FP': 0, 'TN': 19618, 'FN': 1426},
# {'TP': 4624, 'FP': 126, 'TN': 14100, 'FN': 226},
# {'TP': 1138, 'FP': 4, 'TN': 6671, 'FN': 492},
# {'TP': 7615, 'FP': 92, 'TN': 20871, 'FN': 234},
# {'TP': 78, 'FP': 18, 'TN': 1880, 'FN': 4},
# {'TP': 1103, 'FP': 172, 'TN': 3758, 'FN': 7}
# ]
# threshold:298.86314585743395,jump_size:120

+ 6
- 0
database_crawlers/image_patcher/test_image_patcher/nci_svs_and_masks/run_patch_evaluator_and_visualizer.sh View File

# Append the four parent directories and the current directory to PYTHONPATH,
# then launch the patch evaluator / visualizer.
export PYTHONPATH="${PYTHONPATH}:../../../../:../../../:../../:../:./"
python evaluate_image_patcher_and_visualize.py

+ 4
- 0
database_crawlers/image_patcher/test_image_patcher/run_patch_distribution.sh View File

# Append the two parent directories and the current directory to PYTHONPATH,
# then launch the patcher distribution script.
export PYTHONPATH="${PYTHONPATH}:../../:../:./"
python patcher_distribution.py

+ 18
- 0
database_crawlers/national_cancer_institute/cells_chart.py View File

import pathlib

import matplotlib.pyplot as plt

from national_cancer_institute.read_xml_file import get_slide_info_from_bcr_xml

if __name__ == '__main__':
    data_dir = "data/"

    # Merge the slide tuples of every biospecimen XML below data_dir.
    slide_infos = {}
    for xml_path in pathlib.Path(data_dir).glob("**/*.xml"):
        slide_infos.update(get_slide_info_from_bcr_xml(str(xml_path)))

    # Keep the tumor percentage (index 1) of slides whose stromal
    # percentage (index 2) is zero.
    cell_percents = [int(info[1]) for info in slide_infos.values() if int(info[2]) == 0]

    print("tumor:", cell_percents.count(100))
    print("normal", cell_percents.count(0))
    print([percent for percent in cell_percents if percent not in (0, 100)])
    print(len(cell_percents))

    plt.hist(cell_percents, bins=150)
    plt.savefig("tumor_cell_distribution.jpeg")

BIN
database_crawlers/national_cancer_institute/gdc-client View File


+ 1147
- 0
database_crawlers/national_cancer_institute/gdc_manifest_20220701_140911.txt
File diff suppressed because it is too large
View File


+ 65
- 0
database_crawlers/national_cancer_institute/patch_distribution.py View File

import concurrent.futures
import os
import pathlib

import matplotlib.pyplot as plt
from tqdm import tqdm

from config import Config
from image_patcher import ImageAndSlidePatcher


def save_patch_distribution(database_path):
    """Plot how many patches were saved per slide of a database.

    Every ``*.svs`` file below ``<database_path>/data`` is inspected; slides
    that have a folder in ``<database_path>/patches`` contribute a pair
    ``(saved jpeg patches, initial fragment count)``. Two histograms are
    written to the working directory: ``patch_distribution.jpeg`` and
    ``patch_percent_distribution.jpeg``.

    :param database_path: root directory of the database.
    """
    slides_root = os.path.join(database_path, "data")
    patch_dir = os.path.join(database_path, "patches")
    zarr_extensions = [".tiff", ".tif", ".svs"]

    def patch_image(image_path):
        """Return (saved patches, initial fragments) for one slide, or None."""
        try:
            path_text = str(image_path)
            slide_id = ImageAndSlidePatcher._get_file_name_from_path(path_text).split(".")[0]
            slide_patch_dir = os.path.join(patch_dir, slide_id)

            if ImageAndSlidePatcher._get_extension_from_path(path_text) in zarr_extensions:
                loaded = ImageAndSlidePatcher._zarr_loader(path_text)
            else:
                loaded = ImageAndSlidePatcher._jpeg_loader(path_text)
                loaded = ImageAndSlidePatcher.ask_image_scale_and_rescale(loaded)
            total_counts = ImageAndSlidePatcher._get_number_of_initial_frags(zarr_object=loaded)

            # Slides without a patch folder are reported as None.
            if not os.path.exists(slide_patch_dir):
                return None
            saved_patches = len(list(pathlib.Path(slide_patch_dir).glob("*.jpeg")))
            return saved_patches, total_counts
        except Exception as e:
            print("error")
            print(e)
            raise e

    res_patch_counts = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=Config.workers) as executor:
        image_paths = list(pathlib.Path(slides_root).glob("**/*.svs"))
        print()
        results = executor.map(patch_image, image_paths)
        for res in tqdm(results, total=len(image_paths)):
            if res:
                res_patch_counts.append(res)
    print(res_patch_counts)

    # Absolute number of saved patches per slide.
    plt.hist([counts[0] for counts in res_patch_counts], bins=100)
    plt.xlabel("Patch per slide")
    plt.ylabel("Frequency")
    plt.savefig("patch_distribution.jpeg")
    plt.clf()

    # Saved patches as a percentage of the slide's initial fragments.
    percents = [round(counts[0] / (counts[1] + 0.00001), 5) * 100 for counts in res_patch_counts]
    plt.hist(percents, bins=100)
    plt.xlabel("Patch per slide percent")
    plt.ylabel("Frequency")
    plt.savefig("patch_percent_distribution.jpeg")
    plt.clf()


if __name__ == '__main__':
    # The database lives in "national_cancer_institute", one directory up.
    nci_database_path = os.path.join("../", "national_cancer_institute")
    save_patch_distribution(nci_database_path)

+ 26
- 0
database_crawlers/national_cancer_institute/read_xml_file.py View File

from xml.dom import minidom


def get_slide_info_from_bcr_xml(xml_path):
    """Read the per-slide cell percentages from a BCR biospecimen XML file.

    The barcode and the three percentage elements found under the first
    ``bio:patient`` element are paired by position.

    Unlike the previous implementation, every slide in the file is read (it
    used to stop after ten), and a slide with an empty element is skipped on
    its own instead of silently discarding all slides that follow it.

    NOTE(review): positional pairing assumes every slide carries each of the
    four elements exactly once - confirm against the XML schema.

    :param xml_path: path (or file object) accepted by ``minidom.parse``.
    :return: dict mapping slide barcode to the string tuple
        ``(percent_normal_cells, percent_tumor_cells, percent_stromal_cells)``.
    :raises IndexError: when the document has no ``bio:patient`` element.
    """
    document = minidom.parse(xml_path)
    patient = document.childNodes[0].getElementsByTagName("bio:patient")[0]

    tag_names = (
        "shared:bcr_slide_barcode",
        "bio:percent_normal_cells",
        "bio:percent_tumor_cells",
        "bio:percent_stromal_cells",
    )
    columns = [patient.getElementsByTagName(tag_name) for tag_name in tag_names]

    data_dict = {}
    # zip stops at the shortest column, like the old index-based loop did.
    for slide_nodes in zip(*columns):
        try:
            slide_barcode, normal, tumor, stromal = [
                node.childNodes[0].data.strip() for node in slide_nodes
            ]
        except (IndexError, AttributeError):
            # Empty element (no child) or a first child without text data.
            continue
        data_dict[slide_barcode] = (normal, tumor, stromal)
    return data_dict


if __name__ == '__main__':
    # Manual check against a single downloaded biospecimen file.
    sample_xml_path = "../national_cancer_institute/data/1aea8f2a-f809-4f19-bed3-1365e9aab33b/nationwidechildrens.org_biospecimen.TCGA-BJ-A28X.xml"
    print(get_slide_info_from_bcr_xml(sample_xml_path))

+ 4
- 0
database_crawlers/national_cancer_institute/run_cell_distribution.sh View File

# Append the two parent directories and the current directory to PYTHONPATH,
# then draw the tumor cell distribution chart.
export PYTHONPATH="${PYTHONPATH}:../../:../:./"
python cells_chart.py

+ 5
- 0
database_crawlers/national_cancer_institute/run_patch_distribution.sh View File

# Append the parent directories, the sibling image_patcher package and the
# current directory to PYTHONPATH, then draw the patch distribution charts.
export PYTHONPATH="${PYTHONPATH}:../../:../:../image_patcher:./"
python patch_distribution.py

+ 29
- 0
database_crawlers/papsociaty/duplicate_image.txt View File

1811210076455461803,BENIGN-BTNpap-hyperplZC04-10370x4_0
7845783054062606488,BENIGN-Dan_Colloid1_0
7637345021064072354,BENIGN-FTH468BTN_sheets_follicles_1_0
814792220375115888,BENIGN-GD40T
2888555316355101926,BENIGN-gd-fn20_0
-8312464544528256569,BENIGN-gd10p_0
-2254510488499374008,BENIGN-gd20g_0
2271447961045108683,MALIGNANT-ATC20L_0
7269337178939971574,MALIGNANT-ATC20b_0
-9073528708751422131,MALIGNANT-ATC40BB_0
-8571937763202005072,MALIGNANT-ATC40FC_0
-8610450256188951874,MALIGNANT-ATC40J_0
-7854679564093375561,MALIGNANT-ATC40p_0
-4977783033606377395,MALIGNANT-ATC5A_0
1342853015262631578,MALIGNANT-Atc63w_0
431837691807971266,MALIGNANT-C03-54313A_0
8004646368797684873,MALIGNANT-C03-54313B_0
8041832687277297518,MALIGNANT-C03-54313C_0
6449521961463025237,MALIGNANT-C03-54313D_0
-8688277356782858138,MALIGNANT-C03-54313E_0
-9068734717515564721,MALIGNANT-C03-54313F_0
-5752120463987418399,MALIGNANT-C03-54313G_0
8889530878367993817,MALIGNANT-Ed82C_1_0
2934951614446666978,"MALIGNANT-Follicular_neoplasm2,_low_power,_confirmed_FVPTC_DQ_SM_0"
-1541914789037593200,"MALIGNANT-Pap_CA,_excellent_inclusion,_DQ_0"
-1717557305576530323,"MALIGNANT-Pap_CA,_powdery_nuclei,_small_nucleoli_and_small_inclusion,_PAP_hp_0"
947862089311894766,"MALIGNANT-Papillary_CA,_giant_cell,_DQ_lp_0"
-8520432463383972777,"MALIGNANT-Papillary_CA,_giant_cell_and_tumor_sheets,_PAP_lp2_0"
-4759582710547943524,"MALIGNANT-Papillary_CA,_high_cellularity_3,_PAP_lp_0"

+ 13
- 0
database_crawlers/papsociaty/remove_duplicate_patches.py View File

import os
import shutil
if __name__ == '__main__':
    # Every line of the listing starts with the patch folder id, followed by
    # a comma and the image name.
    duplicate_info_file_path = "duplicate_image.txt"
    with open(duplicate_info_file_path, "r") as file:
        for line in file:
            folder_id = line.partition(",")[0]
            folder_path = os.path.join("./patches", folder_id)
            if not os.path.exists(folder_path):
                print("no")
                continue
            shutil.rmtree(folder_path)
            print("deleted")

+ 1319
- 0
database_crawlers/rescale console log for papsociety and stanford.txt
File diff suppressed because it is too large
View File


+ 64
- 0
database_crawlers/stanford_tissue_microarray/database_crawler.py View File

import json
from urllib.parse import urlparse
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup

from database_crawlers.web_stain_sample import WebStainImage, StainType


class StanfordTissueMicroArrayStainSample(WebStainImage):
    """One image of the Stanford Tissue Microarray database."""

    def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
        super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)

    def get_slide_view_url(self):
        """Return the download URL of the image."""
        return "https://storage.googleapis.com/jpg.tma.im/{}".format(self.image_id)

    def get_file_name(self):
        """Return the save path without extension.

        Slashes in the image id become underscores and the part after the
        last dot is dropped.
        """
        flattened_id = self.image_id.replace("/", "_")
        stem_parts = flattened_id.split(".")[:-1]
        return self.save_path + ".".join(stem_parts)

    def get_relative_image_path(self):
        """Return the path the JPEG is saved to."""
        return "{}.jpeg".format(self.get_file_name())

    def get_relative_json_path(self):
        """Return the path the JSON description is saved to."""
        return "{}.json".format(self.get_file_name())

    def crawl_image_save_jpeg(self):
        """Download the image and write its JSON description next to it."""
        urlretrieve(self.get_slide_view_url(), self.get_relative_image_path())
        serialized = json.dumps(self.to_json())
        with open(self.get_relative_json_path(), "w") as outfile:
            outfile.write(serialized)


class StanfordTissueMicroArraySlideProvider:
    """Lists the thyroid images offered by the Stanford Tissue Microarray site."""

    page_link = "https://tma.im/cgi-bin/selectImages.pl?organ=thyroid"
    database_name = "StanfordTissueMicroArray"
    stain_type = StainType.UNKNOWN
    is_wsi = False

    @classmethod
    def get_web_stain_samples(cls):
        """Yield one ``StanfordTissueMicroArrayStainSample`` per search result."""
        form_data = {'250 small images': '250 small images'}
        request_headers = {
            'Cookie': 'DAD_ATTEMPTS=0; DAD_SID=36d77eb69e009b1cf1ebc9c3d7866546; DAD_USERID=WORLD'
        }
        response = requests.post(cls.page_link, files=[], headers=request_headers, data=form_data)
        soup = BeautifulSoup(response.content.decode("utf-8"), 'html.parser')

        # Result tiles are selected by their class and inline style.
        tile_attributes = {"class": "iDiv0", "style": "width: 86px; height: 260px;"}
        for result_item in soup.find_all("div", tile_attributes):
            image_url = result_item.find("a", {"target": "_blank"}).attrs['href']
            # The image id is the URL path without its first segment.
            path_segments = urlparse(image_url).path.strip("/").split("/")
            image_id = "/".join(path_segments[1:])
            # The label is the second-to-last caption paragraph of the tile.
            captions = list(result_item.find_all("p", {"class": "iDiv1"}))
            image_web_label = captions[-2].text
            yield StanfordTissueMicroArrayStainSample(cls.database_name, image_id, image_web_label, None,
                                                      cls.stain_type, cls.is_wsi)


if __name__ == '__main__':
    # Log and download every sample listed by the provider.
    samples = StanfordTissueMicroArraySlideProvider.get_web_stain_samples()
    for sample in samples:
        print(sample.image_id, sample.image_web_label, sample.get_slide_view_url())
        sample.crawl_image_save_jpeg()

+ 87
- 0
database_crawlers/utils.py View File

import concurrent.futures
import concurrent.futures
import time
from urllib.error import HTTPError
from urllib.request import urlretrieve

from torch.utils.data import IterableDataset
from tqdm import tqdm

from config import Config


def find_in_log_n(start, end, func, bias=0.3):
    """Find the last value in ``[start, end)`` accepted by a monotone predicate.

    ``func`` is expected to be true up to some value and false from then on.
    The search narrows the interval until it is at most one wide and returns
    its lower bound; ``start`` itself is never probed.

    ``bias`` is the position of each probe inside the current interval
    (0.5 is a plain bisection). It is now applied to every probe; previously
    the recursive calls fell back to the default after the first one.

    :param start: lower bound, returned when no probe succeeds.
    :param end: exclusive upper bound.
    :param func: predicate called with an integer probe.
    :param bias: fraction of the interval at which to probe.
    :return: the largest probed value for which ``func`` was true, or ``start``.
    """
    while end - start > 1:
        mid = int(start * (1 - bias) + end * bias)
        if mid == start:
            # Always make progress, even when the bias rounds down to start.
            mid += 1
        if func(mid):
            start = mid
        else:
            end = mid
    return start


def fetch_tile_content(tile_url, retry=15):
    """Download ``tile_url`` and return its raw bytes, retrying on failure.

    The response is read straight from the connection. The previous
    ``urlretrieve`` based version left one temporary file on disk per tile.

    After each failed attempt the function sleeps ``2 ** (0.3 * attempt)``
    seconds. After the last failed attempt the user is prompted on stdin;
    answering ``y`` restarts the whole retry cycle, anything else re-raises
    the last error.

    :param tile_url: URL of the tile.
    :param retry: number of attempts per cycle.
    :return: the downloaded content as ``bytes``.
    :raises HTTPError: when ``retry`` allows no attempt at all.
    """
    # Imported locally because the module only imports urlretrieve.
    from urllib.request import urlopen

    for i in range(retry):
        try:
            with urlopen(tile_url) as response:
                return response.read()
        except Exception:
            print("e", end="|")
            time.sleep(2 ** (0.3 * (i + 1)))
            if i == retry - 1:
                if input("continue") == "y":
                    return fetch_tile_content(tile_url, retry)
                raise
    # Only reached when the loop body never ran (retry <= 0).
    raise HTTPError(tile_url, code=500, msg="Not able to fetch image tile", hdrs={}, fp=None)


def download_urls_in_thread(url_and_index_list):
    """Download many tiles with a thread pool and yield them in input order.

    :param url_and_index_list: sized sequence of ``(url, index)`` pairs.
    :return: generator of ``(content, index)`` pairs, with a progress bar.
    """
    def fetch_pair(url_and_index):
        tile_url, tile_index = url_and_index
        return fetch_tile_content(tile_url), tile_index

    with concurrent.futures.ThreadPoolExecutor(max_workers=Config.workers) as pool:
        # map() hands the results back in the order of the input list.
        ordered_results = pool.map(fetch_pair, url_and_index_list)
        for content, tile_index in tqdm(ordered_results, total=len(url_and_index_list)):
            yield content, tile_index


def _get_alignment_sore_and_percent(seq1, seq2, match_score=2, mismatch_score=-1, gap_score=-1):
    """Return the global alignment score of two sequences.

    Uses the ``alignment`` package with a simple match/mismatch scoring and a
    fixed gap score; no backtrace is computed.
    """
    from alignment.sequence import Sequence
    from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner
    from alignment.vocabulary import Vocabulary

    # Both sequences are encoded with one shared vocabulary, first seq1 then seq2.
    vocabulary = Vocabulary()
    first_encoded = vocabulary.encodeSequence(Sequence(seq1))
    second_encoded = vocabulary.encodeSequence(Sequence(seq2))

    aligner = GlobalSequenceAligner(SimpleScoring(match_score, mismatch_score), gap_score)
    return aligner.align(first_encoded, second_encoded, backtrace=False)


def get_normalized_score(seq1, seq2):
    """Return the alignment score divided by the combined sequence length."""
    raw_score = _get_alignment_sore_and_percent(seq1, seq2)
    combined_length = len(seq2) + len(seq1)
    return raw_score / combined_length


class DatasetWithGenerator(IterableDataset):
    """Iterable dataset that hands out one pre-built generator.

    The stored object itself is returned on every ``iter()`` call, so the
    data can only be consumed once.
    """

    def __init__(self, generator):
        # Kept as-is; no copy or re-creation happens.
        self.generator = generator

    def __iter__(self):
        source = self.generator
        return source


if __name__ == '__main__':
    import math

    # Number of probes a bisection over one million values would need.
    search_space = 1000 * 1000
    print(math.log2(search_space))
    # Smoke test: the last value up to 76 inside [0, 100).
    last_accepted = find_in_log_n(0, 100, lambda x: x <= 76)
    print(last_accepted)

+ 296
- 0
database_crawlers/web_stain_sample.py View File

import enum
import json
import time
from io import BytesIO
from urllib.request import Request, urlopen

import cv2
import numpy as np
from PIL import Image
from tifffile import TiffWriter

from database_crawlers.utils import find_in_log_n, fetch_tile_content, download_urls_in_thread


class StainType(enum.Enum):
    """Stain kinds; each value is an ``(id, display name)`` tuple."""

    H_AND_E = (0, "H&E")
    UNKNOWN = (1, "UNKNOWN")


class ThyroidCancerLevel(enum.Enum):
    """Malignancy levels; each value is an ``(id, display name)`` tuple."""

    UNKNOWN = (-1, "UNKNOWN")
    MALIGNANT = (0, "MALIGNANT")
    BENIGN = (1, "BENIGN")

    @staticmethod
    def get_thyroid_level_from_diagnosis_label(label: str):
        """Map a diagnosis label to a level by case-insensitive keyword search.

        "malignant" is checked before "benign"; labels containing neither
        give ``UNKNOWN``.
        """
        lowered = label.lower()
        keyword_levels = (
            ("malignant", ThyroidCancerLevel.MALIGNANT),
            ("benign", ThyroidCancerLevel.BENIGN),
        )
        for keyword, level in keyword_levels:
            if keyword in lowered:
                return level
        return ThyroidCancerLevel.UNKNOWN


class ThyroidType(enum.Enum):
    """Thyroid diagnoses; each value is an ``(id, display name)`` tuple."""

    UNKNOWN = (-1, "UNKNOWN")
    NORMAL = (0, "NORMAL")
    PAPILLARY_CARCINOMA = (1, "PAPILLARY_CARCINOMA")
    GRAVES_DISEASE = (2, "GRAVES_DISEASE")
    NODULAR_GOITER = (3, "NODULAR_GOITER")
    HASHIMOTO_THYROIDITIS = (4, "HASHIMOTO_THYROIDITIS")
    FOLLICULAR_CARCINOMA = (5, "FOLLICULAR_CARCINOMA")
    FOLLICULAR_ADENOMA = (6, "FOLLICULAR_ADENOMA")
    COLLOID_GOITER = (7, "COLLOID_GOITER")

    @staticmethod
    def get_thyroid_type_from_diagnosis_label(label: str):
        """Map a diagnosis label to a type by case-insensitive keyword search.

        The rules are tried from top to bottom and the first rule whose
        keywords all occur in the label wins. A "follicular" label without
        "adenoma" is classified as follicular carcinoma.
        """
        lowered = label.lower()
        ordered_rules = (
            (("normal",), ThyroidType.NORMAL),
            (("papillary",), ThyroidType.PAPILLARY_CARCINOMA),
            (("grave",), ThyroidType.GRAVES_DISEASE),
            (("nodular", "goiter"), ThyroidType.NODULAR_GOITER),
            (("hashimoto",), ThyroidType.HASHIMOTO_THYROIDITIS),
            (("follicular", "adenoma"), ThyroidType.FOLLICULAR_ADENOMA),
            (("follicular",), ThyroidType.FOLLICULAR_CARCINOMA),
            (("colloid", "goiter"), ThyroidType.COLLOID_GOITER),
        )
        for keywords, thyroid_type in ordered_rules:
            if all(keyword in lowered for keyword in keywords):
                return thyroid_type
        return ThyroidType.UNKNOWN


class WebStainImage:
    """Base description of a stain image hosted by a web database.

    Subclasses supply the URL and the download logic. The two abstract
    methods now raise ``NotImplementedError``; they used to call the
    ``NotImplemented`` constant, which fails with ``TypeError`` because that
    constant is not callable.
    """

    # Directory prefix prepended to every saved file.
    save_path = "data/"

    def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
        self.database_name = database_name
        self.image_id = image_id
        self.image_web_label = image_web_label
        self.report = report
        self.stain_type = stain_type
        self.is_wsi = is_wsi

    def to_json(self):
        """Return the image description as a JSON-serialisable dict."""
        return {"database_name": self.database_name,
                "image_id": self.image_id,
                "image_web_label": self.image_web_label,
                "image_class_label": self.image_class_label,
                "report": self.report,
                "stain_type": self.stain_type.value[1],
                "is_wsi": self.is_wsi}

    @staticmethod
    def sorted_json_keys():
        """Return the keys of :meth:`to_json` in their canonical order."""
        return ["database_name",
                "image_id",
                "image_web_label",
                "image_class_label",
                "report",
                "stain_type",
                "is_wsi"]

    @property
    def image_class_label(self):
        """Display name of the thyroid type derived from the web label."""
        return ThyroidType.get_thyroid_type_from_diagnosis_label(self.image_web_label).value[1]

    def get_slide_view_url(self):
        """Return the URL of the image; must be provided by subclasses."""
        raise NotImplementedError("get_slide_view_url")

    def crawl_image_save_jpeg_and_json(self):
        """Download and store the image; must be provided by subclasses."""
        raise NotImplementedError("crawl_image_get_jpeg")

    def _get_file_path_name(self):
        # Save path without extension.
        return self.save_path + self.image_id

    def _get_relative_image_path(self):
        return self._get_file_path_name() + ".jpeg"

    def _get_relative_tiff_image_path(self):
        return self._get_file_path_name() + ".tiff"

    def _get_relative_json_path(self):
        return self._get_file_path_name() + ".json"

    def _save_json_file(self):
        """Write :meth:`to_json` to the JSON path of this image."""
        json_object = json.dumps(self.to_json())
        with open(self._get_relative_json_path(), "w") as outfile:
            outfile.write(json_object)


class WebStainWSI(WebStainImage):
    """Base class for whole-slide images that are served as tiles.

    The abstract methods now raise ``NotImplementedError``; they used to call
    the ``NotImplemented`` constant, which fails with ``TypeError``.
    """

    def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
        super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)

    def _get_tile_url(self, zoom, partition=None, i=None, j=None):
        """Return the URL of one tile; must be provided by subclasses."""
        raise NotImplementedError("_get_tile_url")

    def _generate_tile_urls(self):
        """Yield the URLs of all tiles; must be provided by subclasses."""
        raise NotImplementedError("generate tile urls")

    def find_best_zoom(self):
        """Return the zoom level to download (0 unless overridden)."""
        return 0

    def _find_first_tile_width(self):
        """Download the first tile and return its ``(width, height)``."""
        image_content = fetch_tile_content(self._get_tile_url(self.find_best_zoom(), partition=0, i=0, j=0))
        img = Image.open(BytesIO(image_content))
        return img.size[0], img.size[1]

    def _fetch_all_tiles(self):
        """Yield ``(content, index)`` for every tile URL.

        All URLs are collected first and then downloaded as a single batch.
        """
        batch = [(url, index) for index, url in enumerate(self._generate_tile_urls())]
        if batch:
            for content, downloaded_index in download_urls_in_thread(batch):
                yield content, downloaded_index
        print("Slide download tiles done!!!")

    def crawl_image_save_jpeg_and_json(self):
        """Download and store the slide; must be provided by subclasses."""
        raise NotImplementedError("crawl_image_save_jpeg_and_json")


class WebStainWSIOneDIndex(WebStainWSI):
    """Whole-slide image whose tiles are addressed by one running index ("partition")."""

    def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
        super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)
        # Index of the last existing tile; filled in by crawl_image_save_jpeg_and_json.
        self.last_partition = None

    def _find_last_partition(self):
        """Search for the highest partition index whose tile URL answers a HEAD request."""
        print("Finding last partition: ", end="")

        def func(partition, retry=3):
            # True when the tile exists; False after ``retry`` failed HEAD requests.
            print(partition, end="")
            for i in range(retry):
                try:
                    request = Request(self._get_tile_url(self.find_best_zoom(), partition=partition), method='HEAD')
                    resp = urlopen(request)
                    headers = resp.info()
                    print("<", end=", ")
                    return True
                except Exception as e:
                    print("e", end="")
                    time.sleep(2 ** (0.1 * (i + 1)))
            print(">", end=", ")
            return False

        # Searches partitions in [0, 1000000).
        return find_in_log_n(0, 1000 * 1000, func)

    def _generate_tile_urls(self):
        """Yield the tile URL of every partition from 0 to ``last_partition``."""
        for partition in range(self.last_partition + 1):
            yield self._get_tile_url(self.find_best_zoom(), partition=partition)

    def crawl_image_save_jpeg_and_json(self):
        """Download all tiles, stream them into a tiled BigTIFF and save the JSON file."""
        def generator():
            # First replays the tiles buffered while detecting the row length,
            # then continues with the remaining downloads.
            while True:
                if first_temp_rows:
                    yield first_temp_rows[0]
                    del first_temp_rows[0]
                else:
                    res = next(content_fetcher, -1)
                    if res == -1:
                        break
                    img = cv2.imdecode(np.frombuffer(res[0], np.uint8), -1)
                    if len(img.shape) == 2:
                        # Two-dimensional (single channel) tiles are expanded to three channels.
                        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
                    yield img

        first_image_width, first_image_height = self._find_first_tile_width()
        first_temp_rows = []
        column_tiles, row_tiles = None, None
        self.last_partition = self._find_last_partition()
        content_fetcher = self._fetch_all_tiles()
        with TiffWriter(self._get_relative_tiff_image_path(), bigtiff=True) as tif:
            # Read tiles until one is not as wide as the first tile; its index + 1
            # is taken as the number of tiles per row.
            # NOTE(review): if every tile has the first tile's width, the fetcher runs
            # dry and __next__() raises StopIteration here - confirm this cannot happen.
            while column_tiles is None:
                content, index = content_fetcher.__next__()
                image_array = cv2.imdecode(np.frombuffer(content, np.uint8), cv2.IMREAD_COLOR)
                first_temp_rows.append(image_array)
                if image_array.shape[1] != first_image_width:
                    column_tiles = index + 1
                    row_tiles = (self.last_partition + 1) // column_tiles
            shape = (first_image_height * row_tiles, first_image_width * column_tiles, 3)
            tif.write(generator(), subfiletype=1, tile=(first_image_height, first_image_width), shape=shape,
                      dtype=np.uint8,
                      compression='JPEG',  # TODO
                      photometric='rgb')

        """
        Save json file
        """
        self._save_json_file()


class WebStainWSITwoDIndex(WebStainWSI):
    """Whole-slide image whose tiles are addressed by a pair of indices ``(i, j)``."""

    def __init__(self, database_name, image_id, image_web_label, report, stain_type, is_wsi):
        super().__init__(database_name, image_id, image_web_label, report, stain_type, is_wsi)
        # Highest existing tile indices; filled in by crawl_image_save_jpeg_and_json.
        self.last_i = None
        self.last_j = None

    def _generate_tile_urls(self):
        """Yield all tile URLs with ``j`` as the outer and ``i`` as the inner loop."""
        for j in range(self.last_j + 1):
            for i in range(self.last_i + 1):
                yield self._get_tile_url(self.find_best_zoom(), i=i, j=j)

    def _find_last_i_and_j(self):
        """Search for the highest ``i`` (at ``j=0``) and ``j`` (at ``i=0``) that exist."""
        def func(i, j, retry=3):
            # True when the tile exists; False after ``retry`` failed HEAD requests.
            print(f"{i}-{j}", end="")
            for r in range(retry):
                try:
                    request = Request(self._get_tile_url(self.find_best_zoom(), i=i, j=j), method='HEAD')
                    resp = urlopen(request)
                    headers = resp.info()
                    print("<", end=", ")
                    return True
                except Exception as e:
                    print("e", end="")
                    time.sleep(2 ** (0.1 * (r + 1)))
            print(">", end=", ")
            return False

        # Both axes are searched in [0, 1000).
        print("Finding last i: ", end="")
        i_func = lambda i: func(i=i, j=0)
        last_i = find_in_log_n(0, 1000, i_func)
        print("\nFinding last j: ")
        j_func = lambda j: func(i=0, j=j)
        last_j = find_in_log_n(0, 1000, j_func)
        return last_i, last_j

    def crawl_image_save_jpeg_and_json(self):
        """Download all tiles, stream them into a tiled BigTIFF and save the JSON file."""
        def generator():
            # Decodes every downloaded tile, rejects oversized ones and caps
            # width and height at 256 pixels.
            while True:
                res = next(content_fetcher, -1)
                if res == -1:
                    break
                res = cv2.imdecode(np.frombuffer(res[0], np.uint8), -1)
                if max(res.shape) >= 260:
                    raise Exception(f"warning shape: {res.shape}")
                res = cv2.resize(res, (min(res.shape[1], 256), min(res.shape[0], 256)))
                yield res

        # Tiles are written as fixed 256 x 256 blocks.
        first_image_width = 256
        first_image_height = 256
        self.last_i, self.last_j = self._find_last_i_and_j()
        content_fetcher = self._fetch_all_tiles()
        with TiffWriter(self._get_relative_tiff_image_path(), bigtiff=True) as tif:
            shape = (first_image_height * (self.last_j + 1), first_image_width * (self.last_i + 1), 3)
            tif.write(generator(), subfiletype=1,
                      tile=(first_image_height, first_image_width),
                      shape=shape,
                      dtype=np.uint8,
                      compression='JPEG',  # TODO
                      photometric='rgb')

        """
        Save json file
        """
        self._save_json_file()

+ 20
- 0
datasets_sample_view/convert_to_jpeg.py View File

from PIL import Image
import zarr
import tifffile

def convert_tif_to_jpeg():
    """Copy page 0 of the test TIFF into a zarr store.

    The image is opened through tifffile's zarr interface and saved with
    ``zarr.save``. The earlier PIL-based JPEG export (open, convert to RGB,
    save with quality 90) was abandoned because of memory problems.
    """
    source_tiff_path = "data/test/1672.tiff"
    target_store_path = "data/test/out.zarr"

    tiff_store = tifffile.imread(source_tiff_path, aszarr=True, key=0)
    page_array = zarr.open(tiff_store, mode='r')
    zarr.save(target_store_path, page_array)


if __name__ == '__main__':
    # Raise PIL's pixel limit to 1000 x 1000 tiles of 256 x 256 pixels.
    tile_pixels = 256 * 256
    Image.MAX_IMAGE_PIXELS = 1000 * 1000 * tile_pixels
    convert_tif_to_jpeg()

+ 50
- 0
datasets_sample_view/dataset_sample_view.py View File

# import libtiff
# import pytiff
import cv2
import tifffile


def show_tif_image(address, name, key=0, w_from=0, h_from=0, size=700, whole_image=False):
    """Open one page of a TIFF and show a crop of it in an OpenCV window.

    :param address: path of the TIFF file.
    :param name: label used in the window title and the console output.
    :param key: page of the TIFF to open.
    :param w_from: start index of the crop along the first axis.
    :param h_from: start index of the crop along the second axis.
    :param size: maximum extent of the crop along both axes.
    :param whole_image: show the complete page instead of a crop.
    """
    import zarr as zarr_module

    tiff_store = tifffile.imread(address, aszarr=True, key=key)
    page = zarr_module.open(tiff_store, mode='r')

    if whole_image:
        first_axis = slice(0, page.shape[0])
        second_axis = slice(0, page.shape[1])
    else:
        # The crop is clipped at the page borders.
        first_axis = slice(w_from, min(w_from + size, page.shape[0]))
        second_axis = slice(h_from, min(h_from + size, page.shape[1]))
    image_frag = page[first_axis, second_axis]

    cv2.imshow(f"name:{name} - shape:{image_frag.shape} - page:{key}", image_frag)
    print(f"name: {name}, shape: {page.shape}")
    tiff_store.close()


def show_CAMELYON16_sample_view():
    """Show a crop of page 0 of a CAMELYON16 sample slide."""
    # Alternative view: page 7 of the same slide, without offsets.
    show_tif_image(address='data/CAMELYON16/tumor_084.tif', name="CAMELYON16",
                   key=0, w_from=10000, h_from=50000)


def show_CAMELYON17_sample_view():
    """Show page 7 of the CAMELYON17 ``patient_083_node_4`` slide with the default crop."""
    slide_path = 'data/CAMELYON17/patient_083_node_4.tif'
    show_tif_image(slide_path, "CAMELYON17", key=7)


def show_Papsociety_sample_view():
    """Display the Papsociety sample JPEG in an OpenCV window.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image file.
    """
    image_path = 'data/Papsociety/Follicular_neoplasm2,_low_power,_confirmed_FVPTC_DQ_SM.jpg'
    image_frag = cv2.imread(image_path)
    if image_frag is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # instead of raising, so fail here with a clear message.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    cv2.imshow(f"Papsociety - {image_frag.shape}", image_frag)


def show_test(name):
    """Show a crop (up to 1000 x 1000) of page 0 of ``data/test/1272.tiff``.

    Args:
        name: Label forwarded to ``show_tif_image``.
    """
    test_path = 'data/test/1272.tiff'
    show_tif_image(test_path, name, key=0, w_from=1300, h_from=0, size=1000)



if __name__ == "__main__":
    # show_CAMELYON16_sample_view()
    # show_CAMELYON17_sample_view()
    # show_Papsociety_sample_view()
    sample_path = 'data/test/1272.tiff'
    # Two crops of the same page that differ only in the second-axis offset.
    for label, second_axis_offset in (("1", 100), ("2", 1000)):
        show_tif_image(sample_path, label, key=0, w_from=1000, h_from=second_axis_offset, size=1000)

    # Keep servicing the OpenCV windows until 'q' is pressed.
    quit_key = ord('q')
    while cv2.waitKey(1) != quit_key:
        pass

+ 180
- 0
requirements.txt View File

absl-py==1.0.0
aiohttp==3.8.1
aiosignal==1.2.0
alignment==1.0.10
appdirs==1.4.4
argon2-cffi==20.1.0
asgiref==3.2.10
astunparse==1.6.3
async-generator==1.10
async-timeout==4.0.2
attrs==21.2.0
backcall==0.2.0
bleach==3.3.0
blis==0.7.5
cachetools==4.2.4
catalogue==2.0.6
certifi==2021.10.8
cffi==1.14.5
charset-normalizer==2.0.8
click==8.0.3
colorama==0.4.4
convertapi==1.4.0
cryptography==3.4.7
cycler==0.11.0
cymem==2.0.6
Cython==0.29.23
decorator==5.0.9
defusedxml==0.7.1
distlib==0.3.2
dj-database-url==0.5.0
Django==3.1.2
django-crispy-forms==1.9.2
django-heroku==0.3.1
django-rest==0.8.7
djangorestframework==3.13.1
djangorestframework-simplejwt==5.0.0
entrypoints==0.3
et-xmlfile==1.1.0
factory-boy==3.2.1
Faker==12.3.0
filelock==3.0.12
flatbuffers==2.0
fonttools==4.28.2
frozenlist==1.3.0
gast==0.4.0
gensim==4.1.2
google-auth==2.3.3
google-auth-oauthlib==0.4.6
google-pasta==0.2.0
grpcio==1.42.0
gunicorn==20.0.4
h5py==3.6.0
hazm==0.7.0
huggingface-hub==0.6.0
idna==3.3
importlib-metadata==4.8.2
ipykernel==5.5.5
ipython==7.24.1
ipython-genutils==0.2.0
ipywidgets==7.6.3
jedi==0.18.0
Jinja2==3.0.1
joblib==1.0.1
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.12
jupyter-console==6.4.0
jupyter-core==4.7.1
jupyterlab-pygments==0.1.2
jupyterlab-widgets==1.0.0
keras==2.7.0
Keras-Preprocessing==1.1.2
kiwisolver==1.3.2
langcodes==3.3.0
libclang==12.0.0
libtiff==0.4.2
Markdown==3.3.6
MarkupSafe==2.0.1
matplotlib==3.5.0
matplotlib-inline==0.1.2
mistune==0.8.4
multidict==6.0.2
murmurhash==1.0.6
nbclient==0.5.3
nbconvert==6.0.7
nbformat==5.1.3
nest-asyncio==1.5.1
nltk==3.3
notebook==6.4.0
numpy==1.20.3
oauthlib==3.1.1
opencv-python==4.5.2.54
openpyxl==3.0.7
opt-einsum==3.3.0
packaging==20.9
pandas==1.2.4
pandocfilters==1.4.3
parso==0.8.2
pathy==0.6.1
pickleshare==0.7.5
Pillow==8.4.0
preshed==3.0.6
prometheus-client==0.11.0
prompt-toolkit==3.0.18
protobuf==3.19.1
psycopg2==2.8.6
pyaes==1.6.1
pyaml==21.10.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.20
pydantic==1.8.2
Pygments==2.9.0
PyJWT==2.3.0
pynput==1.7.5
pyOpenSSL==20.0.1
pyparsing==2.4.7
pyrsistent==0.17.3
pyTelegramBotAPI==4.4.0
python-dateutil==2.8.1
python-decouple==3.6
pytz==2020.1
PyWavelets==1.1.1
pywin32==301
pywinpty==1.1.1
PyYAML==6.0
pyzmq==22.1.0
qtconsole==5.1.0
QtPy==1.9.0
regex==2022.4.24
requests==2.26.0
requests-oauthlib==1.3.0
rsa==4.8
scikit-learn==0.24.2
scipy==1.7.0
Send2Trash==1.5.0
setuptools-scm==6.3.2
six==1.16.0
sklearn==0.0
smart-open==5.2.1
spacy==3.2.1
spacy-legacy==3.0.8
spacy-loggers==1.0.1
sqlparse==0.4.1
srsly==2.4.2
Telethon==1.24.0
tensorboard==2.7.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.0
tensorflow==2.7.0
tensorflow-estimator==2.7.0
tensorflow-io-gcs-filesystem==0.22.0
termcolor==1.1.0
terminado==0.10.0
testpath==0.5.0
thinc==8.0.13
threadpoolctl==2.2.0
tokenizers==0.12.1
tomli==1.2.2
torch==1.10.1
torchtext==0.11.1
torchvision==0.11.1
tornado==6.1
tqdm==4.62.3
traitlets==5.0.5
transformers==4.19.2
typer==0.4.0
typing_extensions==4.0.1
urllib3==1.26.7
virtualenv==20.4.7
wasabi==0.9.0
wcwidth==0.2.5
webencodings==0.5.1
Werkzeug==2.0.2
whitenoise==5.2.0
widgetsnbextension==3.5.1
wrapt==1.13.3
xlrd==2.0.1
yarl==1.7.2
zipp==3.6.0

+ 21
- 0
utils.py View File

import itertools

import cv2


def show_and_wait(img, name="img", wait=True, save=False):
    """Show ``img`` in an OpenCV window, optionally blocking and saving it.

    Args:
        img: Image passed to ``cv2.imshow`` and ``cv2.imwrite``.
        name: Window title; also the stem of the saved file name.
        wait: When true, block until 'q' is pressed, then close all windows.
        save: When true, write the image to ``{name}.jpeg``.
    """
    cv2.imshow(name, img)
    if wait:
        quit_key = ord('q')
        pressed = cv2.waitKey()
        while pressed != quit_key:
            pressed = cv2.waitKey()
        # NOTE(review): windows are closed only on the blocking path; confirm
        # this nesting matches the intended behaviour.
        cv2.destroyAllWindows()
    if save:
        cv2.imwrite(f"{name}.jpeg", img)


def check_if_generator_is_empty(generator):
    """Return ``None`` for an empty iterable, else an equivalent iterator.

    The first item has to be consumed to detect emptiness, so callers must
    iterate over the returned object rather than the original argument.

    Args:
        generator: A generator, iterator, or any other iterable.

    Returns:
        ``None`` if ``generator`` yields nothing; otherwise an iterator that
        yields every item of ``generator``, including the first.
    """
    # iter() is a no-op for generators/iterators and lets plain iterables
    # such as lists work too (next() on a list would raise TypeError).
    iterator = iter(generator)
    try:
        first = next(iterator)
    except StopIteration:
        return None
    # Put the consumed first item back in front of the remaining items.
    return itertools.chain([first], iterator)

Loading…
Cancel
Save