Commit 95642b36 authored by mibaumgartner

LIDC, Luna, Decathlon Guide

parent a3d8812a
@@ -152,7 +152,7 @@ Furthermore, we provide pretrained models which can be used without investing la
| <!-- --> | <!-- --> | <!-- --> |
|:--------:|:--------:|:--------:|
| | [nnDetection v0.1](/docs/results/nnDetectionV001.md) | |
</div>
# Decathlon
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: http://medicaldecathlon.com/
## Setup
0. Follow the installation instructions of nnDetection and create the data directories for the intended tasks, e.g. `Task003_Liver`.
1. Follow the instructions and usage policies to download the data and place the images, labels, and `dataset.json` file inside the raw folder of the respective task, e.g. imagesTr -> `Task003_Liver / raw / imagesTr`, labelsTr -> `Task003_Liver / raw / labelsTr` and dataset.json -> `Task003_Liver / raw / dataset.json` (a small layout check is sketched below).
2. Run `python prepare.py [tasks]` in `projects / Task001_Decathlon / scripts` of the nnDetection repository, e.g. to prepare all tasks: `python prepare.py Task003_Liver Task007_Pancreas Task008_HepaticVessel Task010_Colon`
3. Run `nndet_seg2det [tasks]` to convert the semantic segmentation labels to instance segmentations, e.g. to convert all tasks: `nndet_seg2det 003 007 008 010`
4. Run ... to download and replace the manually corrected labels. # TODO
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
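Before running `prepare.py`, the raw folder of each task should match the layout from step 1. A minimal sketch of such a check, using `Task003_Liver` as an example and assuming the `det_data` environment variable is set as required by nnDetection:

```python
import os
from pathlib import Path

# Check the layout described in step 1 for one task (Task003_Liver as an example).
raw = Path(os.getenv("det_data")) / "Task003_Liver" / "raw"
for sub in ("imagesTr", "labelsTr"):
    assert (raw / sub).is_dir(), f"missing directory: {raw / sub}"
assert (raw / "dataset.json").is_file(), f"missing file: {raw / 'dataset.json'}"
print(sorted(p.name for p in (raw / "imagesTr").glob("*.nii.gz"))[:3])
```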
import argparse
import os
import shutil
import sys
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path

from loguru import logger

from nndet.io.load import save_json
from nndet.io.prepare import maybe_split_4d_nifti, create_test_split
from nndet.io import get_case_ids_from_dir, load_json, save_yaml
from nndet.utils.check import env_guard


def process_case(case_id,
                 source_images,
                 source_labels,
                 target_images,
                 target_labels,
                 ):
    logger.info(f"Processing case {case_id}")
    maybe_split_4d_nifti(source_images / f"{case_id}.nii.gz", target_images)
    shutil.copy2(source_labels / f"{case_id}.nii.gz", target_labels)


@env_guard
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('tasks', type=str, nargs='+',
                        help="One or multiple of: Task003_Liver, Task007_Pancreas, "
                             "Task008_HepaticVessel, Task010_Colon",
                        )
    args = parser.parse_args()
    tasks = args.tasks

    # per-task settings for the semantic-to-instance conversion and the final label maps
    decathlon_props = {
        "Task003_Liver": {
            "seg2det_stuff": [1, ],  # liver
            "seg2det_things": [2, ],  # cancer
            "min_size": 3.,
            "labels": {"0": "cancer"},
            "labels_stuff": {"1": "liver"},
        },
        "Task007_Pancreas": {
            "seg2det_stuff": [1, ],  # pancreas
            "seg2det_things": [2, ],
            "min_size": 3.,
            "labels": {"0": "cancer"},
            "labels_stuff": {"1": "pancreas"},
        },
        "Task008_HepaticVessel": {
            "seg2det_stuff": [1, ],  # vessel
            "seg2det_things": [2, ],
            "min_size": 3.,
            "labels": {"0": "tumour"},
            "labels_stuff": {"1": "vessel"},
        },
        "Task010_Colon": {
            "seg2det_stuff": [],
            "seg2det_things": [1, ],
            "min_size": 3.,
            "labels": {"0": "cancer"},
            "labels_stuff": {},
        },
    }

    basedir = Path(os.getenv('det_data'))
    for task in tasks:
        task_data_dir = basedir / task

        logger.remove()
        logger.add(sys.stdout, level="INFO")
        logger.add(task_data_dir / "prepare.log", level="DEBUG")
        logger.info(f"Preparing task: {task}")

        source_raw_dir = task_data_dir / "raw"
        source_data_dir = source_raw_dir / "imagesTr"
        source_labels_dir = source_raw_dir / "labelsTr"
        splitted_dir = task_data_dir / "raw_splitted"

        if not source_data_dir.is_dir():
            raise ValueError(f"Expected training images at {source_data_dir}")
        if not source_labels_dir.is_dir():
            raise ValueError(f"Expected training labels at {source_labels_dir}")
        if not (p := source_raw_dir / "dataset.json").is_file():
            raise ValueError(f"Expected dataset json to be located at {p}")

        target_data_dir = splitted_dir / "imagesTr"
        target_label_dir = splitted_dir / "labelsTr"
        target_data_dir.mkdir(parents=True, exist_ok=True)
        target_label_dir.mkdir(parents=True, exist_ok=True)

        # prepare meta
        original_meta = load_json(source_raw_dir / "dataset.json")
        dataset_info = {
            "task": task,
            "name": original_meta["name"],
            "target_class": None,
            "test_labels": True,
            "modalities": original_meta["modality"],
            "dim": 3,
            "info": {
                "original_labels": original_meta["labels"],
                "original_numTraining": original_meta["numTraining"],
            },
        }
        dataset_info.update(decathlon_props[task])
        save_json(dataset_info, task_data_dir / "dataset.json")

        # prepare data and labels
        case_ids = get_case_ids_from_dir(source_data_dir, remove_modality=False)
        case_ids = sorted([c for c in case_ids if c])
        logger.info(f"Found {len(case_ids)} cases for preparation.")

        for cid in case_ids:
            process_case(cid,
                         source_data_dir,
                         source_labels_dir,
                         target_data_dir,
                         target_label_dir,
                         )
        # with Pool(processes=6) as p:
        #     p.starmap(process_case, zip(case_ids,
        #                                 repeat(source_data_dir),
        #                                 repeat(source_labels_dir),
        #                                 repeat(target_data_dir),
        #                                 repeat(target_label_dir),
        #                                 ))

        # create an artificial test split
        create_test_split(splitted_dir=splitted_dir,
                          num_modalities=1,
                          test_size=0.3,
                          random_state=0,
                          shuffle=True,
                          )


if __name__ == '__main__':
    main()
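The `seg2det_stuff` / `seg2det_things` entries above steer the `nndet_seg2det` conversion: classes listed under `seg2det_things` (e.g. the tumours) are split into connected components that become individual detection instances, while `seg2det_stuff` classes (e.g. liver) remain semantic targets (`labels_stuff`). The snippet below only illustrates the connected-component idea with `scipy.ndimage.label`; it is not the repository's implementation:

```python
import numpy as np
from scipy import ndimage

# Toy semantic mask: 1 = liver ("stuff"), 2 = cancer ("things"), mirroring Task003_Liver above.
sem = np.zeros((1, 10, 10), dtype=np.uint8)
sem[0, 1:8, 1:8] = 1      # liver
sem[0, 2:4, 2:4] = 2      # first lesion
sem[0, 5:7, 5:7] = 2      # second lesion

# "Things" voxels are split into connected components that become detection instances.
instances, num_instances = ndimage.label(sem == 2)
print(num_instances)                                     # 2 lesions -> 2 instances
print({str(i): 0 for i in range(1, num_instances + 1)})  # all instances map to class 0 ("cancer")
```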
@@ -10,6 +10,6 @@ Please read the information from the homepage carefully and follow the rules and
1. Follow the instructions and usage policies to download the data and place all the folders which contain the data and labels for each case into `Task011_Kits / raw`
2. Run `python prepare.py` in `projects / Task011_Kits / scripts` of the nnDetection repository.
3. Run `nndet_seg2det 011` to convert the semantic segmentation labels to instance segmentations.
4. Run ... to download and replace the manually corrected labels. # TODO
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
# LIDC
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://wiki.cancerimagingarchive.net/display/Public/LIDC-IDRI
## Setup via MIC LIDC Data Preprocessing
0. Follow https://github.com/MIC-DKFZ/LIDC-IDRI-processing to convert the LIDC data into a simpler format.
1. Follow the installation instructions of nnDetection and create a data directory named `Task012_LIDC`.
2. Place the `data_nrrd` folder and `characteristics.csv` into `Task012_LIDC / raw`
3. Run `python prepare_mic.py` in `projects / Task012_LIDC / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
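`prepare_mic.py` (shown further below) fuses the up-to-four rater segmentations per nodule by voxel-wise majority vote and maps the mean malignancy rating (1 to 5 scale) to a binary instance class. A minimal sketch of that rating rule, with illustrative values:

```python
import numpy as np

# Malignancy ratings (1-5) of the raters who segmented one nodule; missing raters
# are padded with 0, and all values > -1 enter the mean (mirrors prepare_case below).
rater_labels = [3, 4]
rater_labels.extend([0] * (4 - len(rater_labels)))
mal_score = np.mean([r for r in rater_labels if r > -1])  # (3 + 4 + 0 + 0) / 4 = 1.75
instance_class = int(mal_score >= 3)                      # 1 = malignant (mean rating >= 3), else 0
print(mal_score, instance_class)                          # 1.75 0
```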
## Setup PyLIDC
**Coming Soon**
import sys
import os
from itertools import repeat
from multiprocessing.pool import Pool
from pathlib import Path

import pandas as pd
import numpy as np
import numpy.testing as npt
import SimpleITK as sitk
from loguru import logger
from tqdm import tqdm

from nndet.io.load import save_json, load_json
from nndet.io.paths import subfiles
from nndet.utils.check import env_guard


def prepare_case(case_dir: Path, target_dir: Path, df: pd.DataFrame):
    target_data_dir = target_dir / "imagesTr"
    target_label_dir = target_dir / "labelsTr"
    case_id = str(case_dir).split('/')[-1]
    logger.info(f"Processing case {case_id}")
    df = df[df.PatientID == case_id]

    # process data
    img = sitk.ReadImage(str(case_dir / f"{case_id}_ct_scan.nrrd"))
    sitk.WriteImage(img, str(target_data_dir / f"{case_id}.nii.gz"))
    img_arr = sitk.GetArrayFromImage(img)

    # process mask
    final_rois = np.zeros_like(img_arr, dtype=np.uint8)
    mal_labels = {}
    roi_ids = set([ii.split('.')[0].split('_')[-1]
                   for ii in os.listdir(case_dir) if '.nii.gz' in ii])

    rix = 1
    for rid in roi_ids:
        roi_id_paths = [ii for ii in os.listdir(case_dir) if '{}.nii'.format(rid) in ii]
        nodule_ids = [ii.split('_')[2].lstrip("0") for ii in roi_id_paths]
        # malignancy ratings of the individual raters; missing raters are padded with 0
        rater_labels = [df[df.NoduleID == int(ii)].Malignancy.values[0] for ii in nodule_ids]
        rater_labels.extend([0] * (4 - len(rater_labels)))
        mal_label = np.mean([ii for ii in rater_labels if ii > -1])

        roi_rater_list = []
        for rp in roi_id_paths:
            roi = sitk.ReadImage(str(case_dir / rp))
            roi_arr = sitk.GetArrayFromImage(roi).astype(np.uint8)
            assert roi_arr.shape == img_arr.shape, [
                roi_arr.shape, img_arr.shape, case_id, roi.GetSpacing()]
            for ix in range(len(img_arr.shape)):
                npt.assert_almost_equal(roi.GetSpacing()[ix], img.GetSpacing()[ix])
            roi_rater_list.append(roi_arr)
        roi_rater_list.extend([np.zeros_like(roi_rater_list[-1])] * (4 - len(roi_id_paths)))

        # voxel-wise majority vote over the (up to four) rater masks
        roi_raters = np.array(roi_rater_list)
        roi_raters = np.mean(roi_raters, axis=0)
        roi_raters[roi_raters < 0.5] = 0
        if np.sum(roi_raters) > 0:
            mal_labels[rix] = mal_label
            final_rois[roi_raters >= 0.5] = rix
            rix += 1
        else:
            # indicate rois suppressed by majority voting of raters
            logger.warning(f'suppressed roi! {roi_id_paths}')

    mask_itk = sitk.GetImageFromArray(final_rois)
    sitk.WriteImage(mask_itk, str(target_label_dir / f"{case_id}.nii.gz"))

    instance_classes = {key: int(item >= 3) for key, item in mal_labels.items()}
    save_json({"instances": instance_classes, "scores": mal_labels},
              target_label_dir / f"{case_id}")


def reformat_labels(target: Path):
    for p in subfiles(target, identifier="*json", join=True):
        label = load_json(Path(p))
        mal_labels = label["scores"]
        instance_classes = {key: int(item >= 3) for key, item in mal_labels.items()}
        save_json({"instances": instance_classes, "scores": mal_labels}, Path(p))


def delete_without_label(target: Path):
    for p in subfiles(target, identifier="*.npz", join=True):
        _p = str(p).rsplit('.', 1)[0] + '.pkl'
        if not os.path.isfile(_p):
            os.remove(p)


def check_data_load(target: Path):
    for p in tqdm(subfiles(target, identifier="*.npy", join=True)):
        try:
            data = np.load(p)
        except Exception as e:
            print(f"Failed to load: {p} with {e}")


@env_guard
def main():
    det_data_dir = Path(os.getenv('det_data'))
    task_data_dir = det_data_dir / "Task012_LIDC"
    source_data_dir = task_data_dir / "raw"

    if not (p := source_data_dir / "data_nrrd").is_dir():
        raise ValueError(f"Expected {p} to contain the LIDC data")
    if not (p := source_data_dir / 'characteristics.csv').is_file():
        raise ValueError(f"Expected {p} to exist")

    target_dir = task_data_dir / "raw_splitted"
    target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
    target_data_dir.mkdir(exist_ok=True, parents=True)
    target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
    target_label_dir.mkdir(exist_ok=True, parents=True)

    logger.remove()
    logger.add(sys.stdout, level="INFO")
    logger.add(task_data_dir / "prepare.log", level="DEBUG")

    data_dir = source_data_dir / "data_nrrd"
    case_dirs = [x for x in data_dir.iterdir() if x.is_dir()]
    df = pd.read_csv(source_data_dir / 'characteristics.csv', sep=';')
    for cd in case_dirs:
        prepare_case(cd, target_dir, df)

    # TODO download custom split file


if __name__ == '__main__':
    main()
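For reference, the per-case label JSON written by `prepare_case` above pairs every instance id in the mask with a binary class (1 if the mean malignancy rating is at least 3) and keeps the raw mean rating as a score. The values below are made up, purely for illustration:

```python
# Hypothetical content of raw_splitted/labelsTr/<case_id>.json (illustrative values only):
example_label = {
    "instances": {"1": 0, "2": 1},   # instance id -> binary class
    "scores": {"1": 2.5, "2": 4.0},  # instance id -> mean malignancy rating
}
```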
# Luna16
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://luna16.grand-challenge.org/Home/
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task016_Luna`.
1. Follow the instructions and usage policies to download the data and place all the subsets into `Task016_Luna / raw`
2. Run `python prepare.py` in `projects / Task016_Luna / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
Notes:
- Since Luna is evaluated with a 10-fold cross-validation, all 10 folds need to be run.
- All runs should use the `--sweep` option, and consolidation should be performed with `--no_model -c copy` since no separate test set will be predicted.
## Evaluation
1. Run `python prepare_eval_cpm.py [model_name]` to convert the predictions to the Luna format (a sketch of the coordinate conversion follows below).
Note: The script needs access to the raw_splitted images.
2. Download and run the Luna evaluation script.
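`prepare_eval_cpm.py` (included below) converts each predicted box center from voxel indices to world coordinates before writing the `seriesuid, coordX, coordY, coordZ, probability` CSV; the centers come back in array (z, y, x) order and have to be flipped to (x, y, z) for SimpleITK. A minimal sketch of that conversion with a toy image (spacing, origin, and the center value are illustrative):

```python
import SimpleITK as sitk

# Toy image standing in for a raw_splitted CT (spacing/origin are illustrative).
image_itk = sitk.Image(512, 512, 128, sitk.sitkInt16)
image_itk.SetSpacing((0.7, 0.7, 1.25))
image_itk.SetOrigin((-180.0, -180.0, -300.0))

# One predicted box center in array (z, y, x) voxel order.
center_zyx = (42.0, 180.5, 210.0)

# SimpleITK expects continuous indices in (x, y, z) order before mapping to world coordinates.
index_xyz = (center_zyx[2], center_zyx[1], center_zyx[0])
coord_x, coord_y, coord_z = image_itk.TransformContinuousIndexToPhysicalPoint(index_xyz)
print(coord_x, coord_y, coord_z)   # values for coordX/coordY/coordZ in the Luna CSV
```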
import os
import sys
import traceback
from collections import defaultdict
from itertools import repeat
from multiprocessing.pool import Pool
from pathlib import Path

import pandas as pd
import SimpleITK as sitk
from loguru import logger

from nndet.io.prepare import create_test_split
from nndet.io.itk import create_circle_mask_itk
from nndet.io.load import save_pickle, save_json, save_yaml, load_json
from nndet.utils.check import env_guard


def create_masks(source: Path, target: Path, df: pd.DataFrame):
    files = []
    split = {}
    # map each case id to its official Luna subset for later fold generation
    for i in range(10):
        subset_dir = source / f"subset{i}"
        if not subset_dir.is_dir():
            logger.error(f"{subset_dir} is not a valid subset directory!")
            continue
        tmp = list(subset_dir.glob('*.mhd'))
        files.extend(tmp)
        for t in tmp:
            split[t.stem.replace('.', '_')] = i
    save_json(split, target.parent.parent / "splits.json")

    # collect annotated nodule centers and radii (world coordinates) per case
    centers = []
    rads = []
    for f in files:
        c = []
        r = []
        try:
            series_df = df.loc[[f.name.rsplit('.', 1)[0]]]
        except KeyError:
            pass
        else:
            for _, row in series_df.iterrows():
                c.append((float(row['coordX']), float(row['coordY']), float(row['coordZ'])))
                r.append(float(row['diameter_mm']) / 2)
        centers.append(c)
        rads.append(r)
    assert len(files) == len(centers) == len(rads)

    with Pool(processes=6) as p:
        p.starmap(_create_mask, zip(files, repeat(target), centers, rads))
    # for t in zip(files, repeat(target), centers, rads):
    #     _create_mask(*t)


def _create_mask(source, target, centers, rads):
    try:
        logger.info(f"Processing {source.stem}")
        data = sitk.ReadImage(str(source))
        mask = create_circle_mask_itk(data, centers, rads, ndim=3)
        sitk.WriteImage(mask, str(target / f"{source.stem.replace('.', '_')}.nii.gz"))
        save_json({"instances": {str(k + 1): 0 for k in range(len(centers))}},
                  target / f"{source.stem.replace('.', '_')}.json")
    except Exception as e:
        logger.error(f"Case {source.stem} failed with {e} and {traceback.format_exc()}")


def create_splits(source, target):
    files = []
    for p in source.glob('subset*'):
        path = Path(p)
        if not p.is_dir():
            continue
        _files = [str(i).rsplit('.', 1)[0] for i in path.iterdir() if i.suffix == ".mhd"]
        files.append(_files)

    splits = []
    for i in range(len(files)):
        train_ids = list(range(len(files)))
        test = files[i]
        val_idx = (i + 1) % len(files)
        val = files[val_idx]
        # remove by value (not positional pop) so both held-out folds are excluded from training
        train_ids.remove(i)
        train_ids.remove(val_idx)
        assert len(train_ids) == len(files) - 2
        train = [tr for tri in train_ids for tr in files[tri]]
        splits.append({"train": train, "val": val, "test": test})
    save_pickle(splits, target)


def convert_data(source: Path, target: Path):
    for subset_dir in source.glob('subset*'):
        subset_dir = Path(subset_dir)
        if not subset_dir.is_dir():
            continue
        with Pool(processes=6) as p:
            p.starmap(_convert_data, zip(subset_dir.glob('*.mhd'), repeat(target)))


def _convert_data(f, target):
    logger.info(f"Converting {f}")
    try:
        data = sitk.ReadImage(str(f))
        sitk.WriteImage(data, str(target / f"{f.stem.replace('.', '_')}_0000.nii.gz"))
    except Exception as e:
        logger.error(f"Case {f} failed with {e} and {traceback.format_exc()}")


@env_guard
def main():
    det_data_dir = Path(os.getenv('det_data'))
    task_data_dir = det_data_dir / "Task016_Luna"
    source_data_dir = task_data_dir / "raw"

    if not source_data_dir.is_dir():
        raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
    for i in range(10):
        if not (p := source_data_dir / f"subset{i}").is_dir():
            raise ValueError(f"Expected {p} to contain Luna data")
    if not (p := source_data_dir / "annotations.csv").is_file():
        raise ValueError(f"Expected {p} to exist.")

    target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
    target_data_dir.mkdir(exist_ok=True, parents=True)
    target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
    target_label_dir.mkdir(exist_ok=True, parents=True)
    target_preprocessed_dir = task_data_dir / "preprocessed"
    target_preprocessed_dir.mkdir(exist_ok=True)

    logger.remove()
    logger.add(sys.stdout, level="INFO")
    logger.add(task_data_dir / "prepare.log", level="DEBUG")

    meta = {
        "name": "Luna",
        "task": "Task016_Luna",
        "target_class": None,
        "test_labels": False,
        "labels": {
            "0": "lesion",
        },
        "modalities": {
            "0": "CT",
        },
        "dim": 3,
    }
    save_json(meta, task_data_dir / "dataset.json")

    # prepare data and labels
    csv = source_data_dir / "annotations.csv"
    convert_data(source_data_dir, target_data_dir)
    df = pd.read_csv(csv, index_col='seriesuid')
    create_masks(source_data_dir, target_label_dir, df)

    # generate split
    logger.info("Generating Luna splits ...")
    saved_original_splits = load_json(task_data_dir / "splits.json")
    logger.info(f"Found {len(list(saved_original_splits.keys()))} ids in splits.json")
    original_fold_ids = defaultdict(list)
    for cid, fid in saved_original_splits.items():
        original_fold_ids[fid].append(cid)

    splits = []
    for test_fold in range(10):
        all_folds = list(range(10))
        all_folds.pop(test_fold)

        train_ids = []
        for af in all_folds:
            train_ids.extend(original_fold_ids[af])

        splits.append({
            "train": train_ids,
            "val": original_fold_ids[test_fold],
        })
    save_pickle(splits, target_preprocessed_dir / "splits_final.pkl")
    save_json(splits, target_preprocessed_dir / "splits_final.json")


if __name__ == '__main__':
    main()
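The fold definitions written above can be sanity-checked before training. A small sketch, assuming `prepare.py` has already been run and `det_data` points at the nnDetection data directory:

```python
import os
from pathlib import Path

from nndet.io.load import load_pickle

# splits_final.pkl holds one {"train": [...], "val": [...]} dict per official Luna subset.
preprocessed_dir = Path(os.getenv("det_data")) / "Task016_Luna" / "preprocessed"
splits = load_pickle(preprocessed_dir / "splits_final.pkl")

print(len(splits))                                     # expected: 10 folds
print(sorted(splits[0].keys()))                        # ['train', 'val']
print(len(splits[0]["train"]), len(splits[0]["val"]))
```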
import argparse
import os
import sys
from collections import defaultdict
from pathlib import Path

import pandas as pd
from loguru import logger
from tqdm import tqdm

from nndet.io.itk import load_sitk
from nndet.io.load import load_pickle
from nndet.core.boxes.ops_np import box_center_np


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('model', type=str, help="Name of model")
    args = parser.parse_args()
    model = args.model

    task_dir = Path(os.getenv("det_models")) / "Task016_Luna"
    model_dir = task_dir / model
    assert model_dir.is_dir()
    raw_splitted_images = Path(os.getenv("det_data")) / "Task016_Luna" / "raw_splitted" / "imagesTr"
    prediction_dir = model_dir / "consolidated" / "val_predictions"
    assert prediction_dir.is_dir()

    logger.remove()
    logger.add(sys.stdout, level="INFO")
    log_file = model_dir / "prepare_eval_cpm.log"
    logger.add(log_file, level="DEBUG")

    prediction_cache = defaultdict(list)
    prediction_paths = sorted([p for p in prediction_dir.iterdir()
                               if p.is_file() and p.name.endswith("_boxes.pkl")])
    logger.info(f"Found {len(prediction_paths)} predictions for evaluation")

    for prediction_path in tqdm(prediction_paths):
        seriesuid = prediction_path.stem.rsplit("_", 1)[0].replace('_', ".")
        predictions = load_pickle(prediction_path)
        data_path = raw_splitted_images / f"{prediction_path.stem.rsplit('_', 1)[0]}_0000.nii.gz"
        image_itk = load_sitk(data_path)

        boxes = predictions["pred_boxes"]
        probs = predictions["pred_scores"]
        centers = box_center_np(boxes)
        assert predictions["restore"]

        for center, prob in zip(centers, probs):
            # box centers are in array (z, y, x) order; SimpleITK expects (x, y, z) indices
            position_image = (float(center[2]), float(center[1]), float(center[0]))
            position_world = image_itk.TransformContinuousIndexToPhysicalPoint(position_image)

            prediction_cache["seriesuid"].append(seriesuid)
            prediction_cache["coordX"].append(float(position_world[0]))
            prediction_cache["coordY"].append(float(position_world[1]))
            prediction_cache["coordZ"].append(float(position_world[2]))
            prediction_cache["probability"].append(float(prob))

    df = pd.DataFrame(prediction_cache)
    df.to_csv(model_dir / f"{model}.csv")