Commit 5d782990 authored by mibaumgartner

More guides

parent 95642b36
@@ -13,6 +13,7 @@
 from nndet.io.prepare import maybe_split_4d_nifti, create_test_split
 from nndet.io import get_case_ids_from_dir, load_json, save_yaml
 from nndet.utils.check import env_guard
+from nndet.utils.info import maybe_verbose_iterable

 def process_case(case_id,
@@ -118,7 +119,7 @@ def main():
     case_ids = sorted([c for c in case_ids if c])
     logger.info(f"Found {len(case_ids)} for preparation.")
-    for cid in case_ids:
+    for cid in maybe_verbose_iterable(case_ids):
         process_case(cid,
                      source_data_dir,
                      source_labels_dir,
...
@@ -15,6 +15,7 @@
 from nndet.io.load import save_json, load_json
 from nndet.io.paths import subfiles
 from nndet.utils.check import env_guard
+from nndet.utils.info import maybe_verbose_iterable

 def prepare_case(case_dir: Path, target_dir: Path, df: pd.DataFrame):
@@ -121,7 +122,7 @@ def main():
     case_dirs = [x for x in data_dir.iterdir() if x.is_dir()]
     df = pd.read_csv(source_data_dir / 'characteristics.csv', sep=';')
-    for cd in case_dirs:
+    for cd in maybe_verbose_iterable(case_dirs):
         prepare_case(cd, target_dir, df)

     # TODO download custom split file
...
@@ -45,8 +45,10 @@ def main():
     meta = {
         "name": "CADA",
         "task": "Task017_CADA",
+        "target_class": None,
+        "test_labels": False,
         "labels": {"0": "aneurysm"},
         "modalities": {"0": "CT"},
         "dim": 3,
...
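The first two hunks route the case-preparation loops through `maybe_verbose_iterable` from `nndet.utils.info`, which adds a progress bar when verbose output is enabled. A minimal sketch of such a helper, assuming a tqdm-based implementation and a hypothetical `det_verbose` environment switch (the actual implementation in `nndet.utils.info` may differ):

```python
import os
from typing import Iterable

from tqdm import tqdm


def maybe_verbose_iterable(iterable: Iterable, **kwargs) -> Iterable:
    """Wrap an iterable in a tqdm progress bar unless verbosity is disabled.

    Sketch only: the environment variable name and its default are assumptions.
    """
    if os.getenv("det_verbose", "1") == "1":
        return tqdm(iterable, **kwargs)
    return iterable
```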
# ProstateX
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Data: https://wiki.cancerimagingarchive.net/display/Public/SPIE-AAPM-NCI+PROSTATEx+Challenges
- Masks: https://github.com/rcuocolo/PROSTATEx_masks
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task021_ProstateX`.
1. Download the data and labels and place them in the following structure:
```text
{det_data}
Task021_ProstateX
raw
ktrains
ProstateX
ProstateX-TrainingLesionInformationv2
rcuocolo-PROSTATEx_masks-e344452
```
We used the masks from git commit e3444521e70cd5e8d405f4e9a6bc08312df8afe7 for our experiments.
Only the T2 masks were used as ground truth, and only the T2, ADC and high b-value sequences were used for training (no Ktrans).
If you intend to use the Ktrans sequence, simply add it to the `dataset.json` file; the data is already prepared by the script (see the excerpt below).
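For reference, this is the `modalities` entry that the `prepare.py` script below writes into `dataset.json` (Ktrans is already included as channel 3):

```json
{
    "modalities": {
        "0": "T2",
        "1": "ADC",
        "2": "PD-W",
        "3": "Ktrans"
    }
}
```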
2. Run `python prepare.py` in `projects / Task021_ProstateX / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
Note: Since ProstateX contains only a fairly small number of clinically significant lesions and we used a 30% test split, we observed fairly high variance in the performance of our runs.
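Based on the target paths the script creates, the prepared data should end up in the following layout (`imagesTs` and `labelsTs` are filled by `create_test_split` at the end of the script):

```text
{det_data}
    Task021_ProstateX
        dataset.json
        raw_splitted
            imagesTr
                ProstateX-0000_0000.nii.gz  (T2)
                ProstateX-0000_0001.nii.gz  (ADC)
                ProstateX-0000_0002.nii.gz  (PD-W)
                ProstateX-0000_0003.nii.gz  (Ktrans)
                ...
            labelsTr
                ProstateX-0000.nii.gz
                ProstateX-0000.json
                ...
```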
import os
import sys
import traceback
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path
import pandas as pd
import SimpleITK as sitk
from nndet.io.prepare import create_test_split
from loguru import logger
from nndet.utils.check import env_guard
from nndet.io import save_json, save_yaml
from nndet.io.itk import load_sitk, load_sitk_as_array, copy_meta_data_itk
from nndet.utils.info import maybe_verbose_iterable
def load_dicom_series_sitk(p):
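    # collect all slice files of the DICOM series in directory p and read them into one 3D image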
reader = sitk.ImageSeriesReader()
dicom_names = reader.GetGDCMSeriesFileNames(str(p))
reader.SetFileNames(dicom_names)
return reader.Execute()
def prepare_case(case_id,
data_dirs,
ktrans_dirs,
t2_masks,
df_labels,
df_masks,
data_target,
label_target,
):
try:
logger.info(f"Preparing {case_id}")
tmp_dir = data_dirs / case_id
_dirs = [f for f in tmp_dir.iterdir() if f.is_dir()]
assert len(_dirs) == 1
        data_dir = _dirs[0]  # iterdir() already yields the full path, no need to re-join with tmp_dir
df_mask_case = df_masks[df_masks['T2'].str.contains(case_id)]
assert len(df_mask_case) == 1
t2_mask_file = df_mask_case.iloc[0]["T2"]
assert f"{case_id}" in t2_mask_file
t2_series_id = int(t2_mask_file.rsplit(".", 2)[0].rsplit('_', 1)[1])
adc_mask_file = df_mask_case.iloc[0]["ADC"]
assert f"{case_id}" in adc_mask_file
if case_id == "ProstateX-0025":
# case 0025 has a 7a inside the table
adc_series_id = 7
assert adc_mask_file.endswith("7a.nii.gz")
elif case_id == "ProstateX-0113":
            # the table lists series 9, but we use series 10 because
            # series 9 does not appear to be an ADC series
adc_series_id = int(adc_mask_file.rsplit(".", 2)[0].rsplit('_', 1)[1])
assert adc_series_id == 9
adc_series_id = 10
else:
adc_series_id = int(adc_mask_file.rsplit(".", 2)[0].rsplit('_', 1)[1])
# T2
t2_dir = [f for f in data_dir.glob("*t2*") if f.name.startswith(f"{t2_series_id}.")]
assert len(t2_dir) == 1
t2_data_itk = load_dicom_series_sitk(t2_dir[0])
# ADC
adc_dir = [f for f in data_dir.glob("*ADC*") if f.name.startswith(f"{adc_series_id}.")]
assert len(adc_dir) == 1
adc_data_itk = load_dicom_series_sitk(adc_dir[0])
# PD-W
pdw_dir = sorted(data_dir.glob("* PD *"))[-1]
pdw_data_itk = load_dicom_series_sitk(pdw_dir)
# k-trans
ktrans_dir = ktrans_dirs / case_id
ktrans_data_itk = load_sitk(ktrans_dir / f"{case_id}-Ktrans.mhd")
# resample data to t2 (only early fusion is currently supported)
resampler = sitk.ResampleImageFilter() # default linear
resampler.SetReferenceImage(t2_data_itk)
adc_data_itk_res = resampler.Execute(adc_data_itk)
pdw_data_itk_res = resampler.Execute(pdw_data_itk)
ktrans_data_itk_res = resampler.Execute(ktrans_data_itk)
# prepare mask
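        # each finding has its own binary mask file; merge them into a single
        # instance map where instance i corresponds to mask_paths[i - 1]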
mask_paths = list(t2_masks.glob(f"{case_id}*"))
fids = [int([l for l in mp.name.split("-") if "Finding" in l][0][7:]) for mp in mask_paths]
mask_itk = load_sitk(str(mask_paths[0]))
mask = sitk.GetArrayFromImage(mask_itk)
mask[mask > 0] = 1
for idx, mp in enumerate(mask_paths[1:], start=2):
_mask = load_sitk_as_array(str(mp))[0]
mask[_mask > 0] = idx
mask_final = sitk.GetImageFromArray(mask)
copy_meta_data_itk(t2_data_itk, mask_final)
df_case = df_labels.loc[df_labels['ProxID'] == case_id]
instances = {}
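        # map each instance id (1-based position in mask_paths) to its class:
        # int(row.ClinSig) -> 1 = clinically significant, 0 = insignificant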
for row in df_case.itertuples():
if row.fid in fids:
instances[fids.index(int(row.fid)) + 1] = int(row.ClinSig)
else:
logger.info(f"Found removed fid {row.fid} in {case_id}")
# save
sitk.WriteImage(t2_data_itk, str(data_target / f"{case_id}_0000.nii.gz"))
sitk.WriteImage(adc_data_itk_res, str(data_target / f"{case_id}_0001.nii.gz"))
sitk.WriteImage(pdw_data_itk_res, str(data_target / f"{case_id}_0002.nii.gz"))
sitk.WriteImage(ktrans_data_itk_res, str(data_target / f"{case_id}_0003.nii.gz"))
sitk.WriteImage(mask_final, str(label_target / f"{case_id}.nii.gz"))
save_json({"instances": instances}, label_target / f"{case_id}.json")
except Exception as e:
logger.error(f"Case {case_id} failed with {e} and {traceback.format_exc()}")
@env_guard
def main():
"""
    Prepare ProstateX for nnDetection.
    Only the provided T2 masks are used as ground truth.
    The Ktrans sequence is converted but not used for training by default.
"""
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task021_ProstateX"
# setup raw paths
source_data_dir = task_data_dir / "raw"
if not source_data_dir.is_dir():
raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
source_data = source_data_dir / "PROSTATEx"
source_masks = source_data_dir / "rcuocolo-PROSTATEx_masks-e344452"
source_ktrans = source_data_dir / "ktrains"
csv_labels = source_data_dir / "ProstateX-TrainingLesionInformationv2" / "ProstateX-Findings-Train.csv"
csv_masks = source_data_dir / "rcuocolo-PROSTATEx_masks-e344452" / "Files" / "Image_list.csv"
data_target = task_data_dir / "raw_splitted" / "imagesTr"
data_target.mkdir(parents=True, exist_ok=True)
label_target = task_data_dir / "raw_splitted" / "labelsTr"
label_target.mkdir(parents=True, exist_ok=True)
logger.remove()
logger.add(sys.stdout, format="{level} {message}", level="INFO")
logger.add(data_target.parent.parent / "prepare.log", level="DEBUG")
base_masks = source_masks / "Files" / "Masks"
t2_masks = base_masks / "T2"
df_labels = pd.read_csv(csv_labels)
df_masks = pd.read_csv(csv_masks)
case_ids = [f.stem.split("-", 2)[:2] for f in t2_masks.glob("*nii.gz")]
case_ids = list(set([f"{c[0]}-{c[1]}" for c in case_ids]))
logger.info(f"Found {len(case_ids)} cases")
# save meta
logger.info("Saving dataset info")
dataset_info = {
"name": "ProstateX",
"task": "Task021_ProstateX",
"target_class": None,
"test_labels": False,
"labels": {
"0": "clinically_significant",
"1": "clinically_insignificant",
},
"modalities": {
"0": "T2",
"1": "ADC",
"2": "PD-W",
"3": "Ktrans"
},
"dim": 3,
"info": "Ground Truth: T2 Masks; \n"
"Modalities: T2, ADC, PD-W, Ktrans \n;"
"Classes: clinically significant = 1, insignificant = 0 \n"
"Keep: ProstateX-0025 '10-28-2011-MR prostaat kanker detectie WDSmc MCAPRODETW-19047'\n"
"Masks\n"
"https://github.com/rcuocolo/PROSTATEx_masks\n"
"Github hash: e3444521e70cd5e8d405f4e9a6bc08312df8afe7"
}
save_json(dataset_info, task_data_dir / "dataset.json")
# prepare labels and data
for cid in maybe_verbose_iterable(case_ids):
prepare_case(cid,
data_dirs=source_data,
ktrans_dirs=source_ktrans,
t2_masks=t2_masks,
df_labels=df_labels,
df_masks=df_masks,
data_target=data_target,
label_target=label_target,
)
# with Pool(processes=6) as p:
# p.starmap(prepare_case, zip(case_ids,
# repeat(source_data),
# repeat(source_ktrans),
# repeat(t2_masks),
# repeat(df_labels),
# repeat(df_masks),
# repeat(data_target),
# repeat(label_target),
# ))
# create test split
create_test_split(task_data_dir / "raw_splitted",
num_modalities=len(dataset_info["modalities"]),
test_size=0.3,
random_state=0,
shuffle=True,
)
if __name__ == '__main__':
main()
# LymphNodes
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://wiki.cancerimagingarchive.net/display/Public/CT+Lymph+Nodes
- Masks: we used the masks provided by the same page
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task025_LymphNodes`.
1. Download the data and labels; place the data into `Task025_LymphNodes / raw / CT Lymph Nodes` and the labels into `Task025_LymphNodes / raw / MED_ABD_LYMPH_MASKS`.
2. Run `python prepare.py` in `projects / Task025_LymphNodes / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
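Based on the directories the script creates, the prepared data ends up in the following layout (`imagesTs` and `labelsTs` are populated by `create_test_split`):

```text
{det_data}
    Task025_LymphNodes
        dataset.json
        raw_splitted
            imagesTr
                [case_id]_0000.nii.gz
                ...
            labelsTr
                [case_id].nii.gz
                [case_id].json
                ...
            imagesTs
            labelsTs
```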
import os
import shutil
import sys
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path
from nndet.utils.check import env_guard
import numpy as np
from loguru import logger
import SimpleITK as sitk
from nndet.io import save_json
from nndet.io.prepare import create_test_split
from nndet.io.itk import load_sitk_as_array
from nndet.utils.info import maybe_verbose_iterable
def prepare_image(
case_id: str,
base_dir: Path,
mask_dir: Path,
raw_splitted_dir: Path,
):
logger.info(f"Processing {case_id}")
root_data_dir = base_dir / case_id
patient_data_dir = []
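    # walk the case directory to locate the single subfolder that contains the DICOM slices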
for root, dirs, files in os.walk(root_data_dir, topdown=False):
if any([f.endswith(".dcm") for f in files]):
patient_data_dir.append(Path(root))
assert len(patient_data_dir) == 1
patient_data_dir = patient_data_dir[0]
reader = sitk.ImageSeriesReader()
dicom_names = reader.GetGDCMSeriesFileNames(str(patient_data_dir))
reader.SetFileNames(dicom_names)
data_itk = reader.Execute()
patient_label_dir = mask_dir / case_id
label_path = [p for p in patient_label_dir.iterdir() if p.is_file() and p.name.endswith(".nii.gz")]
assert len(label_path) == 1
label_path = label_path[0]
mask = load_sitk_as_array(label_path)[0]
instances = np.unique(mask)
instances = instances[instances > 0]
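    # the provided masks already encode lymph node instances;
    # assign every instance id to class 0, the only class (LymphNode)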
meta = {"instances": {str(int(i)): 0 for i in instances}}
meta["original_path_data"] = str(patient_data_dir)
meta["original_path_label"] = str(label_path)
save_json(meta, raw_splitted_dir / "labelsTr" / f"{case_id}.json")
sitk.WriteImage(data_itk, str(raw_splitted_dir / "imagesTr" / f"{case_id}_0000.nii.gz"))
shutil.copy(label_path, raw_splitted_dir / "labelsTr" / f"{case_id}.nii.gz")
@env_guard
def main():
det_data_dir = Path(os.getenv("det_data"))
task_data_dir = det_data_dir / "Task025_LymphNodes"
source_data_base = task_data_dir / "raw"
if not source_data_base.is_dir():
raise RuntimeError(f"{source_data_base} should contain the raw data but does not exist.")
raw_splitted_dir = task_data_dir / "raw_splitted"
(raw_splitted_dir / "imagesTr").mkdir(parents=True, exist_ok=True)
(raw_splitted_dir / "labelsTr").mkdir(parents=True, exist_ok=True)
(raw_splitted_dir / "imagesTs").mkdir(parents=True, exist_ok=True)
(raw_splitted_dir / "labelsTs").mkdir(parents=True, exist_ok=True)
logger.remove()
logger.add(sys.stdout, format="{level} {message}", level="DEBUG")
logger.add(raw_splitted_dir.parent / "prepare.log", level="DEBUG")
meta = {
"name": "Lymph Node TCIA",
"task": "Task025_LymphNodes",
"target_class": None,
"test_labels": True,
"labels": {
"0": "LymphNode",
},
"modalities": {
"0": "CT",
},
"dim": 3,
}
save_json(meta, raw_splitted_dir.parent / "dataset.json")
base_dir = source_data_base / "CT Lymph Nodes"
mask_dir = source_data_base / "MED_ABD_LYMPH_MASKS"
case_ids = sorted([p.name for p in base_dir.iterdir() if p.is_dir()])
logger.info(f"Found {len(case_ids)} cases in {base_dir}")
for cid in maybe_verbose_iterable(case_ids):
prepare_image(
case_id=cid,
base_dir=base_dir,
mask_dir=mask_dir,
raw_splitted_dir=raw_splitted_dir,
)
# with Pool(processes=6) as p:
# p.starmap(
# prepare_image,
# zip(
# case_ids,
# repeat(base_dir),
# repeat(mask_dir),
# repeat(raw_splitted_dir)
# )
# )
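    # split off 30% of the cases as the held-out test set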
create_test_split(raw_splitted_dir,
num_modalities=len(meta["modalities"]),
test_size=0.3,
random_state=0,
shuffle=True,
)
if __name__ == '__main__':
main()