Commit 7246044d authored by mibaumgartner

Merge remote-tracking branch 'origin/master' into main

parents fcec502f 6f4c3333
import sys
import os
from itertools import repeat
from multiprocessing.pool import Pool
import pandas as pd
import numpy as np
import numpy.testing as npt
import SimpleITK as sitk
from pathlib import Path
from loguru import logger
from tqdm import tqdm
from nndet.io.load import save_json, load_json
from nndet.io.paths import subfiles
from nndet.utils.check import env_guard
from nndet.utils.info import maybe_verbose_iterable
def prepare_case(case_dir: Path, target_dir: Path, df: pd.DataFrame):
target_data_dir = target_dir / "imagesTr"
target_label_dir = target_dir / "labelsTr"
    case_id = case_dir.name
logger.info(f"Processing case {case_id}")
df = df[df.PatientID == case_id]
# process data
img = sitk.ReadImage(str(case_dir / f"{case_id}_ct_scan.nrrd"))
sitk.WriteImage(img, str(target_data_dir / f"{case_id}.nii.gz"))
img_arr = sitk.GetArrayFromImage(img)
# process mask
final_rois = np.zeros_like(img_arr, dtype=np.uint8)
mal_labels = {}
roi_ids = set([ii.split('.')[0].split('_')[-1]
for ii in os.listdir(case_dir) if '.nii.gz' in ii])
rix = 1
for rid in roi_ids:
roi_id_paths = [ii for ii in os.listdir(case_dir) if '{}.nii'.format(rid) in ii]
nodule_ids = [ii.split('_')[2].lstrip("0") for ii in roi_id_paths]
rater_labels = [df[df.NoduleID == int(ii)].Malignancy.values[0] for ii in nodule_ids]
rater_labels.extend([0] * (4-len(rater_labels)))
mal_label = np.mean([ii for ii in rater_labels if ii > -1])
roi_rater_list = []
for rp in roi_id_paths:
roi = sitk.ReadImage(str(case_dir / rp))
roi_arr = sitk.GetArrayFromImage(roi).astype(np.uint8)
assert roi_arr.shape == img_arr.shape, [
roi_arr.shape, img_arr.shape, case_id, roi.GetSpacing()]
for ix in range(len(img_arr.shape)):
npt.assert_almost_equal(roi.GetSpacing()[ix], img.GetSpacing()[ix])
roi_rater_list.append(roi_arr)
roi_rater_list.extend([np.zeros_like(roi_rater_list[-1])]*(4-len(roi_id_paths)))
roi_raters = np.array(roi_rater_list)
roi_raters = np.mean(roi_raters, axis=0)
roi_raters[roi_raters < 0.5] = 0
if np.sum(roi_raters) > 0:
mal_labels[rix] = mal_label
final_rois[roi_raters >= 0.5] = rix
rix += 1
else:
# indicate rois suppressed by majority voting of raters
logger.warning(f'suppressed roi! {roi_id_paths}')
mask_itk = sitk.GetImageFromArray(final_rois)
sitk.WriteImage(mask_itk, str(target_label_dir / f"{case_id}.nii.gz"))
    instance_classes = {key: int(item >= 3) for key, item in mal_labels.items()}
save_json({"instances": instance_classes, "scores": mal_labels},
target_label_dir / f"{case_id}")
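
# A minimal sketch of the rater-consensus rule used above (hypothetical helper,
# not part of the original pipeline): ratings from up to four raters are padded
# with 0, the mean is taken over the valid scores (> -1), and a mean malignancy
# >= 3 is later mapped to the positive instance class.
def _consensus_malignancy_example(rater_labels):
    padded = list(rater_labels) + [0] * (4 - len(rater_labels))
    valid = [r for r in padded if r > -1]
    return float(np.mean(valid))  # e.g. [4, 5] -> [4, 5, 0, 0] -> mean 2.25
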
def reformat_labels(target: Path):
for p in subfiles(target, identifier="*json", join=True):
label = load_json(Path(p))
mal_labels = label["scores"]
instance_classes = {key: int(item >= 3) for key, item in mal_labels.items()}
save_json({"instances": instance_classes, "scores": mal_labels}, Path(p))
def delete_without_label(target: Path):
for p in subfiles(target, identifier="*.npz", join=True):
_p = str(p).rsplit('.', 1)[0] + '.pkl'
if not os.path.isfile(_p):
os.remove(p)
def check_data_load(target: Path):
for p in tqdm(subfiles(target, identifier="*.npy", join=True)):
try:
data = np.load(p)
except Exception as e:
print(f"Failed to load: {p} with {e}")
@env_guard
def main():
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task012_LIDC"
source_data_dir = task_data_dir / "raw"
if not (p := source_data_dir / "data_nrrd").is_dir():
raise ValueError(f"Expted {p} to contain LIDC data")
if not (p := source_data_dir / 'characteristics.csv').is_file():
raise ValueError(f"Expted {p} to contain exist")
target_dir = task_data_dir / "raw_splitted"
target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
target_data_dir.mkdir(exist_ok=True, parents=True)
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
target_label_dir.mkdir(exist_ok=True, parents=True)
logger.remove()
logger.add(sys.stdout, level="INFO")
logger.add(task_data_dir / "prepare.log", level="DEBUG")
data_dir = source_data_dir / "data_nrrd"
case_dirs = [x for x in data_dir.iterdir() if x.is_dir()]
df = pd.read_csv(source_data_dir / 'characteristics.csv', sep=';')
for cd in maybe_verbose_iterable(case_dirs):
prepare_case(cd, target_dir, df)
# TODO download custom split file
if __name__ == '__main__':
main()
# Luna16
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://luna16.grand-challenge.org/Home/
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task016_Luna`.
1. Follow the instructions and usage policies to download the data and place all the subsets into `Task016_Luna / raw`.
2. Run `python prepare.py` in `projects / Task016_Luna / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
Notes:
- Since Luna is evaluated via 10-fold cross-validation, all 10 folds need to be trained.
- All runs should use the `--sweep` option, and consolidation should be performed via `--no_model -c copy`, since we are not planning to predict a separate test set. A sketch of the resulting split structure is shown below.
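
For reference, a minimal sketch (with illustrative case ids; `prepare.py` derives the real mapping from the subset directories and stores it in `splits.json`) of how the predefined Luna folds translate into the train/val entries written to `splits_final.pkl`:

```python
# Sketch only: every predefined subset serves once as validation fold.
fold_to_cases = {0: ["case_a", "case_b"], 1: ["case_c"], 2: ["case_d"]}

splits = []
for val_fold in fold_to_cases:
    train = [cid for fold, cases in fold_to_cases.items()
             if fold != val_fold for cid in cases]
    splits.append({"train": train, "val": fold_to_cases[val_fold]})
```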
## Evaluation
1. Run `python prepare_eval_cpm.py [model_name]` to convert the predictions to the Luna format.
Note: The script needs access to the raw_splitted images.
2. Download and run the Luna evaluation script.
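
The conversion script below maps voxel-space box centers back to world coordinates before writing the csv. A minimal SimpleITK sketch of that transform (illustrative file name; note that the array axes are ordered z, y, x, so the index is reversed before the transform):

```python
import SimpleITK as sitk

image = sitk.ReadImage("case_0000.nii.gz")  # illustrative file name
center_zyx = (30.0, 120.0, 115.5)           # box center in voxel space (z, y, x)
index_xyz = (center_zyx[2], center_zyx[1], center_zyx[0])
world_xyz = image.TransformContinuousIndexToPhysicalPoint(index_xyz)
```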
import argparse
import os
import sys
import traceback
from collections import defaultdict
from itertools import repeat
from multiprocessing.pool import Pool
import pandas as pd
import SimpleITK as sitk
from pathlib import Path
from nndet.io.prepare import create_test_split
from loguru import logger
from nndet.io.itk import create_circle_mask_itk
from nndet.io.load import save_pickle, save_json, save_yaml, load_json
from nndet.utils.check import env_guard
def create_masks(source: Path, target: Path, df: pd.DataFrame, num_processes: int):
files = []
split = {}
for i in range(10):
subset_dir = source / f"subset{i}"
if not subset_dir.is_dir():
logger.error(f"{subset_dir} is not s valid subset directory!")
continue
tmp = list((subset_dir.glob('*.mhd')))
files.extend(tmp)
for t in tmp:
split[t.stem.replace('.', '_')] = i
save_json(split, target.parent.parent / "splits.json")
centers = []
rads = []
for f in files:
c = []
r = []
try:
            series_df = df.loc[[f.name.rsplit('.', 1)[0]]]  # list indexing keeps a DataFrame
except KeyError:
pass
else:
for _, row in series_df.iterrows():
c.append((float(row['coordX']), float(row['coordY']), float(row['coordZ'])))
r.append(float(row['diameter_mm']) / 2)
centers.append(c)
rads.append(r)
assert len(files) == len(centers) == len(rads)
with Pool(processes=num_processes) as p:
p.starmap(_create_mask, zip(files, repeat(target), centers, rads))
# for t in zip(files, repeat(target), centers, rads):
# _create_mask(*t)
def _create_mask(source, target, centers, rads):
try:
logger.info(f"Processing {source.stem}")
data = sitk.ReadImage(str(source))
mask = create_circle_mask_itk(data, centers, rads, ndim=3)
sitk.WriteImage(mask, str(target / f"{source.stem.replace('.', '_')}.nii.gz"))
save_json({"instances": {str(k + 1): 0 for k in range(len(centers))}},
target / f"{source.stem.replace('.', '_')}.json")
except Exception as e:
logger.error(f"Case {source.stem} failed with {e} and {traceback.format_exc()}")
def create_splits(source, target):
files = []
for p in source.glob('subset*'):
path = Path(p)
if not p.is_dir():
continue
_files = [str(i).rsplit('.', 1)[0] for i in path.iterdir() if i.suffix == ".mhd"]
files.append(_files)
splits = []
for i in range(len(files)):
        test = files[i]
        val_idx = (i + 1) % len(files)
        val = files[val_idx]
        # select the remaining folds by index; two pop() calls would shift positions
        train_ids = [j for j in range(len(files)) if j not in (i, val_idx)]
        assert len(train_ids) == len(files) - 2
train = [tr for tri in train_ids for tr in files[tri]]
splits.append({"train": train, "val": val, "test": test})
save_pickle(splits, target)
def convert_data(source: Path, target: Path, num_processes: int):
for subset_dir in source.glob('subset*'):
subset_dir = Path(subset_dir)
if not subset_dir.is_dir():
continue
with Pool(processes=num_processes) as p:
p.starmap(_convert_data, zip(subset_dir.glob('*.mhd'), repeat(target)))
def _convert_data(f, target):
logger.info(f"Converting {f}")
try:
data = sitk.ReadImage(str(f))
sitk.WriteImage(data, str(target / f"{f.stem.replace('.', '_')}_0000.nii.gz"))
except Exception as e:
logger.error(f"Case {f} failed with {e} and {traceback.format_exc()}")
@env_guard
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--num_processes', type=int, default=4, required=False,
help="Number of processes to use for preparation.")
args = parser.parse_args()
num_processes = args.num_processes
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task016_Luna"
source_data_dir = task_data_dir / "raw"
if not source_data_dir.is_dir():
raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
for i in range(10):
if not (p := source_data_dir / f"subset{i}"):
raise ValueError(f"Expected {p} to contain Luna data")
if not (p := source_data_dir / "annotations.csv").is_file():
raise ValueError(f"Exptected {p} to exist.")
target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
target_data_dir.mkdir(exist_ok=True, parents=True)
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
target_label_dir.mkdir(exist_ok=True, parents=True)
target_preprocessed_dir = task_data_dir / "preprocessed"
target_preprocessed_dir.mkdir(exist_ok=True)
logger.remove()
logger.add(sys.stdout, level="INFO")
logger.add(task_data_dir / "prepare.log", level="DEBUG")
meta = {
"name": "Luna",
"task": "Task016_Luna",
"target_class": None,
"test_labels": False,
"labels": {
"0": "lesion",
},
"modalities": {
"0": "CT",
},
"dim": 3,
}
save_json(meta, task_data_dir / "dataset.json")
# prepare data and labels
csv = source_data_dir / "annotations.csv"
convert_data(source_data_dir, target_data_dir, num_processes=num_processes)
df = pd.read_csv(csv, index_col='seriesuid')
create_masks(source_data_dir, target_label_dir, df, num_processes=num_processes)
# generate split
logger.info("Generating luna splits... ")
saved_original_splits = load_json(task_data_dir / "splits.json")
logger.info(f"Found {len(list(saved_original_splits.keys()))} ids in splits.json")
original_fold_ids = defaultdict(list)
for cid, fid in saved_original_splits.items():
original_fold_ids[fid].append(cid)
splits = []
for test_fold in range(10):
all_folds = list(range(10))
all_folds.pop(test_fold)
train_ids = []
for af in all_folds:
train_ids.extend(original_fold_ids[af])
splits.append({
"train": train_ids,
"val": original_fold_ids[test_fold],
})
save_pickle(splits, target_preprocessed_dir / "splits_final.pkl")
save_json(splits, target_preprocessed_dir / "splits_final.json")
if __name__ == '__main__':
main()
import argparse
import os
import sys
from pathlib import Path
from collections import defaultdict
import pandas as pd
from loguru import logger
from tqdm import tqdm
from nndet.io.itk import load_sitk
from nndet.io.load import load_pickle
from nndet.core.boxes.ops_np import box_center_np
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('model', type=str, help="Name of model")
args = parser.parse_args()
model = args.model
task_dir = Path(os.getenv("det_models")) / "Task016_Luna"
model_dir = task_dir / model
assert model_dir.is_dir()
raw_splitted_images = Path(os.getenv("det_data")) / "Task016_Luna" / "raw_splitted" / "imagesTr"
prediction_dir = model_dir / "consolidated" / "val_predictions"
assert prediction_dir.is_dir()
logger.remove()
logger.add(sys.stdout, level="INFO")
log_file = model_dir / "prepare_eval_cpm.log"
prediction_cache = defaultdict(list)
prediction_paths = sorted([p for p in prediction_dir.iterdir() if p.is_file() and p.name.endswith("_boxes.pkl")])
logger.info(f"Found {len(prediction_paths)} predictions for evaluation")
for prediction_path in tqdm(prediction_paths):
        seriesuid = prediction_path.stem.rsplit("_", 1)[0].replace('_', ".")
predictions = load_pickle(prediction_path)
data_path = raw_splitted_images / f"{prediction_path.stem.rsplit('_', 1)[0]}_0000.nii.gz"
image_itk = load_sitk(data_path)
boxes = predictions["pred_boxes"]
probs = predictions["pred_scores"]
centers = box_center_np(boxes)
assert predictions["restore"]
for center, prob in zip(centers, probs):
position_image = (float(center[2]), float(center[1]), float(center[0]))
position_world = image_itk.TransformContinuousIndexToPhysicalPoint(position_image)
prediction_cache["seriesuid"].append(seriusuid)
prediction_cache["coordX"].append(float(position_world[0]))
prediction_cache["coordY"].append(float(position_world[1]))
prediction_cache["coordZ"].append(float(position_world[2]))
prediction_cache["probability"].append(float(prob))
df = pd.DataFrame(prediction_cache)
df.to_csv(model_dir / f"{model}.csv")
# CADA
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://cada.grand-challenge.org/Introduction/
- Subtask: Task 1 aneurysm detection
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task017_CADA`.
1. Follow the instructions and usage policies to download the data and place the data and labels at the following locations: data -> `Task017_CADA / raw / train_dataset` and labels -> `Task017_CADA / raw / train_mask_images`
2. Run `python prepare.py` in `projects / Task017_CADA / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
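
Besides copying images and masks, the preparation script below derives the per-case instance file nnDetection expects. A minimal sketch of that step (illustrative file names; it assumes consecutive instance ids in the mask and maps every instance to class 0, i.e. aneurysm):

```python
from pathlib import Path

import SimpleITK as sitk
from nndet.io import save_json

# illustrative file names; the real script derives them from the case id
mask = sitk.GetArrayFromImage(sitk.ReadImage("A001_labeledMasks.nii.gz"))
instances = {str(i): 0 for i in range(1, int(mask.max()) + 1)}  # every aneurysm -> class 0
save_json({"instances": instances}, Path("A001.json"))
```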
import os
import shutil
from pathlib import Path
import SimpleITK as sitk
from nndet.io import save_json
from nndet.utils.check import env_guard
from nndet.utils.info import maybe_verbose_iterable
def run_prep(source_data: Path, source_label: Path,
target_data_dir, target_label_dir: Path):
case_id = f"{(source_data.stem).rsplit('_', 1)[0]}"
shutil.copy(source_data, target_data_dir / f"{case_id}_0000.nii.gz")
shutil.copy(source_label, target_label_dir / f"{case_id}.nii.gz") # rename label file to match data
label_itk = sitk.ReadImage(str(source_label))
label_np = sitk.GetArrayFromImage(label_itk)
instances = {int(_id + 1): 0 for _id in range(label_np.max())}
save_json({"instances": instances}, target_label_dir / f"{case_id}")
@env_guard
def main():
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task017_CADA"
# setup raw paths
source_data_dir = task_data_dir / "raw" / "train_dataset"
if not source_data_dir.is_dir():
raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
source_label_dir = task_data_dir / "raw" / "train_mask_images"
if not source_label_dir.is_dir():
raise RuntimeError(f"{source_label_dir} should contain the raw labels but does not exist.")
# setup raw splitted dirs
target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
target_data_dir.mkdir(exist_ok=True, parents=True)
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
target_label_dir.mkdir(exist_ok=True, parents=True)
# prepare dataset info
meta = {
"name": "CADA",
"task": "Task017_CADA",
"target_class": None,
"test_labels": False,
"labels": {"0": "aneurysm"},
"modalities": {"0": "CT"},
"dim": 3,
}
save_json(meta, task_data_dir / "dataset.json")
# prepare data & label
case_ids = [(p.stem).rsplit('_', 1)[0] for p in source_data_dir.glob("*.nii.gz")]
print(f"Found {len(case_ids)} case ids")
for cid in maybe_verbose_iterable(case_ids):
run_prep(
source_data=source_data_dir / f"{cid}_orig.nii.gz",
source_label=source_label_dir / f"{cid}_labeledMasks.nii.gz",
target_data_dir=target_data_dir,
target_label_dir=target_label_dir,
)
if __name__ == "__main__":
main()
# ADAM
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: http://adam.isi.uu.nl/
- Subtask: Task 1
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task019FG_ADAM`. We added FG to the ID to indicate that unruptured and ruptured aneurysms are treated as a single class, i.e. we run foreground vs background detection without distinguishing the classes.
1. Follow the instructions and usage policies to download the data and place the data into `Task019FG_ADAM / raw / ADAM_release_subjs`
2. Run `python prepare.py` in `projects / Task019_ADAM / scripts` of the nnDetection repository.
3. Run `python split.py` in `projects / Task019_ADAM / scripts` of the nnDetection repository.
4. [Info]: The provided instructions will automatically create a patient-stratified random split. We used a random split for our challenge submission. If the provided split file in the `preprocessed` folder is renamed, nnDetection will automatically fall back to creating a random split.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
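
In the script below, `instances_from_segmentation` is called with `fg_vs_bg=True`, which collapses all annotated instances onto a single foreground class. A minimal sketch of what that remap means for the instance-to-class mapping:

```python
# Sketch only: class-agnostic (fg vs bg) remapping of an instance mapping.
instances = {"1": 0, "2": 1, "3": 1}      # instance id -> original class
fg_instances = {k: 0 for k in instances}  # every instance becomes class 0
```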
import os
import shutil
from pathlib import Path
from nndet.io import save_json
from nndet.io.prepare import instances_from_segmentation
from nndet.utils.check import env_guard
from nndet.utils.info import maybe_verbose_iterable
def run_prep_fg_v_bg(
case_id: str,
source_data: Path,
target_data_dir,
target_label_dir: Path,
struct="pre/struct_aligned.nii.gz", # bias field corrected and aligned
tof="pre/TOF.nii.gz", # tof image
):
struct_path = source_data / case_id / struct
tof_path = source_data / case_id / tof
mask_path = source_data / case_id / "aneurysms.nii.gz"
shutil.copy(struct_path, target_data_dir / f"{case_id}_0000.nii.gz")
shutil.copy(tof_path, target_data_dir / f"{case_id}_0001.nii.gz")
instances_from_segmentation(mask_path,
target_label_dir,
fg_vs_bg=True,
file_name=f"{case_id}",
)
@env_guard
def main():
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task019FG_ADAM"
# setup raw paths
source_data_dir = task_data_dir / "raw" / "ADAM_release_subjs"
if not source_data_dir.is_dir():
raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
# setup raw splitted dirs
target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
target_data_dir.mkdir(exist_ok=True, parents=True)
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
target_label_dir.mkdir(exist_ok=True, parents=True)
# prepare dataset info
meta = {
"name": "ADAM",
"task": "Task019FG_ADAM",
"target_class": None,
"test_labels": False,
"labels": {"0": "Aneurysm"}, # since we are running FG vs BG this is not completely correct
"modalities": {"0": "Structured", "1": "TOF"},
"dim": 3,
}
save_json(meta, task_data_dir / "dataset.json")
# prepare data
case_ids = [p.stem for p in source_data_dir.iterdir() if p.is_dir()]
print(f"Found {len(case_ids)} case ids")
for cid in maybe_verbose_iterable(case_ids):
run_prep_fg_v_bg(
case_id=cid,
source_data=source_data_dir,
target_data_dir=target_data_dir,
target_label_dir=target_label_dir,
)
if __name__ == "__main__":
main()
import os
from collections import OrderedDict
from pathlib import Path
import numpy as np
from sklearn.model_selection import GroupKFold
from nndet.utils.check import env_guard
from nndet.io import get_case_ids_from_dir, save_pickle
@env_guard
def main():
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task019FG_ADAM"
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
splits_file_dir = task_data_dir / "preprocessed"
splits_file_dir.mkdir(parents=True, exist_ok=True)
splits_file = splits_file_dir / "splits_final.pkl"
case_ids = sorted(get_case_ids_from_dir(target_label_dir, remove_modality=False))
case_ids_pat = [c if c.isdigit() else c[:-1] for c in case_ids]
case_ids_pat_unique = list(set(case_ids_pat))
print(f"Found {len(case_ids_pat_unique)} unique patient ids.")
splits = []
kfold = GroupKFold(n_splits=5)
for i, (train_idx, test_idx) in enumerate(kfold.split(case_ids, groups=case_ids_pat)):
train_keys = np.array(case_ids)[train_idx]
test_keys = np.array(case_ids)[test_idx]
splits.append(OrderedDict())
splits[-1]['train'] = train_keys
splits[-1]['val'] = test_keys
print(f"Generated split: {splits[-1]}")
save_pickle(splits, splits_file)
if __name__ == '__main__':
main()
# RibFrac
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://ribfrac.grand-challenge.org/
- Subtask: Task 1
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task020FG_RibFrac`. We added FG to the ID to indicate that we don't distinguish the different classes. (Even if you prepare the data set with classes, the data needs to be placed inside that directory.)
1. Follow the instructions and usage policies to download the data and copy the data/labels/csv files to the following locations:
data -> `Task020FG_RibFrac / raw / imagesTr`; labels -> `Task020FG_RibFrac / raw / labelsTr`; csv files -> `Task020FG_RibFrac / raw`
2. Run `python prepare.py` in `projects / Task020FG_RibFrac / scripts` of the nnDetection repository.
Note: If no manual split is created, nnDetection will create a random 5-fold split, which is what we used for our results.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
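
The `create` function in the script below maps the RibFrac csv label codes onto nnDetection classes: code 0 is background and is skipped, codes 1-4 become classes 0-3, code -1 (undefined) becomes class 4, and with `fg_only=True` every fracture collapses to class 0. A small standalone sketch of that mapping:

```python
# Sketch of the label-code mapping used by the preparation script below.
def map_label_code(code: int, fg_only: bool = False) -> int:
    assert code != 0, "code 0 is background and is skipped upstream"
    if fg_only:
        return 0
    cls = 5 if code == -1 else code
    return cls - 1  # resulting classes range from 0 to 4
```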
import os
import shutil
from pathlib import Path
import pandas as pd
from nndet.io import save_json
from nndet.utils.check import env_guard
from nndet.utils.info import maybe_verbose_iterable
def create(
image_source: Path,
label_source: Path,
image_target_dir: Path,
label_target_dir: Path,
df: pd.DataFrame,
fg_only: bool = False,
):
image_target_dir.mkdir(parents=True, exist_ok=True)
label_target_dir.mkdir(parents=True, exist_ok=True)
case_id = image_source.stem.rsplit('-', 1)[0]
case_id_check = label_source.stem.rsplit('-', 1)[0]
assert case_id == case_id_check, f"case ids not matching, found image {case_id} and label {case_id_check}"
df_case = df.loc[df['public_id'] == case_id]
instances = {}
for row in df_case.itertuples():
_cls = int(row.label_code)
        if _cls == 0:  # background has label code 0 and label id 0
continue
if fg_only:
_cls = 1
elif _cls == -1:
_cls = 5
instances[str(row.label_id)] = _cls - 1 # class range from 0 - 4 // if fg only 0
assert 0 < _cls < 6, f"Something strange happened {_cls}"
save_json({"instances": instances}, label_target_dir / f"{case_id}.json")
shutil.copy2(image_source, image_target_dir / f"{case_id}_0000.nii.gz")
shutil.copy2(label_source, label_target_dir / f"{case_id}.nii.gz")
@env_guard
def main():
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task020_RibFrac"
source_data_dir = task_data_dir / "raw"
if not source_data_dir.is_dir():
raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
if not (p := source_data_dir / "imagesTr").is_dir():
raise ValueError(f"Expected data to be located at {p}")
if not (p := source_data_dir / "labelsTr").is_dir():
raise ValueError(f"Expected labels to be located at {p}")
if not (p := source_data_dir / "ribfrac-train-info-1.csv").is_file():
raise ValueError(f"Expected {p} to exist.")
if not (p := source_data_dir / "ribfrac-train-info-2.csv").is_file():
raise ValueError(f"Expected {p} to exist.")
if not (p := source_data_dir / "ribfrac-val-info.csv").is_file():
raise ValueError(f"Expected {p} to exist.")
target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
target_data_dir.mkdir(exist_ok=True, parents=True)
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
target_label_dir.mkdir(exist_ok=True, parents=True)
    csv_files = [source_data_dir / "ribfrac-train-info-1.csv",
                 source_data_dir / "ribfrac-train-info-2.csv",
                 source_data_dir / "ribfrac-val-info.csv"]
    df = pd.concat([pd.read_csv(f) for f in csv_files])
image_paths = list((source_data_dir / "imagesTr").glob("*.nii.gz"))
image_paths.sort()
label_paths = list((source_data_dir / "labelsTr").glob("*.nii.gz"))
label_paths.sort()
print(f"Found {len(image_paths)} data files and {len(label_paths)} label files.")
assert len(image_paths) == len(label_paths)
meta = {
"name": "RibFracFG",
"task": "Task020FG_RibFrac",
"target_class": None,
"test_labels": False,
"labels": {"0": "fracture"}, # since we are running FG vs BG this is not completely correct
"modalities": {"0": "CT"},
"dim": 3,
}
save_json(meta, task_data_dir / "dataset.json")
for ip, lp in maybe_verbose_iterable(list(zip(image_paths, label_paths))):
create(image_source=ip,
label_source=lp,
image_target_dir=target_data_dir,
label_target_dir=target_label_dir,
df=df,
fg_only=True,
)
if __name__ == '__main__':
main()
# ProstateX
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Data: https://wiki.cancerimagingarchive.net/display/Public/SPIE-AAPM-NCI+PROSTATEx+Challenges
- Masks: https://github.com/rcuocolo/PROSTATEx_masks
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task021_ProstateX`.
1. Download the data and labels and place them in the following structure:
```text
{det_data}
Task021_ProstateX
raw
ktrains
ProstateX
ProstateX-TrainingLesionInformationv2
rcuocolo-PROSTATEx_masks-e344452
```
We used the masks from git hash e3444521e70cd5e8d405f4e9a6bc08312df8afe7 for our experiments.
For training, only the T2 masks were used as ground truth, together with the T2, ADC, and high b-value images (no Ktrans).
If you intend to use the Ktrans sequence, simply add it to the `dataset.json` file; the data is already prepared by the script.
2. Run `python prepare.py` in `projects / Task021_ProstateX / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
Note: Since ProstateX only contains a fairly small number of clinically significant lesions and we used a 30% test split, we observed a fairly high variance in the performance of our runs.
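
The preparation script below resamples the ADC, PD-W, and Ktrans images onto the T2 grid, since only early fusion is currently supported and all modalities therefore need a common grid. A minimal SimpleITK sketch of that step (illustrative file names; linear interpolation is the filter default):

```python
import SimpleITK as sitk

t2 = sitk.ReadImage("t2.nii.gz")    # reference grid
adc = sitk.ReadImage("adc.nii.gz")  # modality to resample

resampler = sitk.ResampleImageFilter()  # default interpolator: linear
resampler.SetReferenceImage(t2)
adc_on_t2 = resampler.Execute(adc)
```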
import os
import sys
import traceback
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path
import pandas as pd
import SimpleITK as sitk
from nndet.io.prepare import create_test_split
from loguru import logger
from nndet.utils.check import env_guard
from nndet.io import save_json, save_yaml
from nndet.io.itk import load_sitk, load_sitk_as_array, copy_meta_data_itk
from nndet.utils.info import maybe_verbose_iterable
def load_dicom_series_sitk(p):
reader = sitk.ImageSeriesReader()
dicom_names = reader.GetGDCMSeriesFileNames(str(p))
reader.SetFileNames(dicom_names)
return reader.Execute()
def prepare_case(case_id,
data_dirs,
ktrans_dirs,
t2_masks,
df_labels,
df_masks,
data_target,
label_target,
):
try:
logger.info(f"Preparing {case_id}")
tmp_dir = data_dirs / case_id
_dirs = [f for f in tmp_dir.iterdir() if f.is_dir()]
assert len(_dirs) == 1
data_dir = tmp_dir / _dirs[0]
df_mask_case = df_masks[df_masks['T2'].str.contains(case_id)]
assert len(df_mask_case) == 1
t2_mask_file = df_mask_case.iloc[0]["T2"]
assert f"{case_id}" in t2_mask_file
t2_series_id = int(t2_mask_file.rsplit(".", 2)[0].rsplit('_', 1)[1])
adc_mask_file = df_mask_case.iloc[0]["ADC"]
assert f"{case_id}" in adc_mask_file
if case_id == "ProstateX-0025":
# case 0025 has a 7a inside the table
adc_series_id = 7
assert adc_mask_file.endswith("7a.nii.gz")
elif case_id == "ProstateX-0113":
# even though the table shows 9 as the series
# ID we use 10 because 9 is not an ADC file?
adc_series_id = int(adc_mask_file.rsplit(".", 2)[0].rsplit('_', 1)[1])
assert adc_series_id == 9
adc_series_id = 10
else:
adc_series_id = int(adc_mask_file.rsplit(".", 2)[0].rsplit('_', 1)[1])
# T2
t2_dir = [f for f in data_dir.glob("*t2*") if f.name.startswith(f"{t2_series_id}.")]
assert len(t2_dir) == 1
t2_data_itk = load_dicom_series_sitk(t2_dir[0])
# ADC
adc_dir = [f for f in data_dir.glob("*ADC*") if f.name.startswith(f"{adc_series_id}.")]
assert len(adc_dir) == 1
adc_data_itk = load_dicom_series_sitk(adc_dir[0])
# PD-W
pdw_dir = sorted(data_dir.glob("* PD *"))[-1]
pdw_data_itk = load_dicom_series_sitk(pdw_dir)
# k-trans
ktrans_dir = ktrans_dirs / case_id
ktrans_data_itk = load_sitk(ktrans_dir / f"{case_id}-Ktrans.mhd")
# resample data to t2 (only early fusion is currently supported)
resampler = sitk.ResampleImageFilter() # default linear
resampler.SetReferenceImage(t2_data_itk)
adc_data_itk_res = resampler.Execute(adc_data_itk)
pdw_data_itk_res = resampler.Execute(pdw_data_itk)
ktrans_data_itk_res = resampler.Execute(ktrans_data_itk)
# prepare mask
mask_paths = list(t2_masks.glob(f"{case_id}*"))
fids = [int([l for l in mp.name.split("-") if "Finding" in l][0][7:]) for mp in mask_paths]
mask_itk = load_sitk(str(mask_paths[0]))
mask = sitk.GetArrayFromImage(mask_itk)
mask[mask > 0] = 1
for idx, mp in enumerate(mask_paths[1:], start=2):
_mask = load_sitk_as_array(str(mp))[0]
mask[_mask > 0] = idx
mask_final = sitk.GetImageFromArray(mask)
copy_meta_data_itk(t2_data_itk, mask_final)
df_case = df_labels.loc[df_labels['ProxID'] == case_id]
instances = {}
for row in df_case.itertuples():
if row.fid in fids:
instances[fids.index(int(row.fid)) + 1] = int(row.ClinSig)
else:
logger.info(f"Found removed fid {row.fid} in {case_id}")
# save
sitk.WriteImage(t2_data_itk, str(data_target / f"{case_id}_0000.nii.gz"))
sitk.WriteImage(adc_data_itk_res, str(data_target / f"{case_id}_0001.nii.gz"))
sitk.WriteImage(pdw_data_itk_res, str(data_target / f"{case_id}_0002.nii.gz"))
sitk.WriteImage(ktrans_data_itk_res, str(data_target / f"{case_id}_0003.nii.gz"))
sitk.WriteImage(mask_final, str(label_target / f"{case_id}.nii.gz"))
save_json({"instances": instances}, label_target / f"{case_id}.json")
except Exception as e:
logger.error(f"Case {case_id} failed with {e} and {traceback.format_exc()}")
@env_guard
def main():
"""
Does not use the KTrans Sequence of ProstateX
This script only uses the provided T2 masks
"""
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task021_ProstateX"
# setup raw paths
source_data_dir = task_data_dir / "raw"
if not source_data_dir.is_dir():
raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
source_data = source_data_dir / "PROSTATEx"
source_masks = source_data_dir / "rcuocolo-PROSTATEx_masks-e344452"
source_ktrans = source_data_dir / "ktrains"
csv_labels = source_data_dir / "ProstateX-TrainingLesionInformationv2" / "ProstateX-Findings-Train.csv"
csv_masks = source_data_dir / "rcuocolo-PROSTATEx_masks-e344452" / "Files" / "Image_list.csv"
data_target = task_data_dir / "raw_splitted" / "imagesTr"
data_target.mkdir(parents=True, exist_ok=True)
label_target = task_data_dir / "raw_splitted" / "labelsTr"
label_target.mkdir(parents=True, exist_ok=True)
logger.remove()
logger.add(sys.stdout, format="{level} {message}", level="INFO")
logger.add(data_target.parent.parent / "prepare.log", level="DEBUG")
base_masks = source_masks / "Files" / "Masks"
t2_masks = base_masks / "T2"
df_labels = pd.read_csv(csv_labels)
df_masks = pd.read_csv(csv_masks)
case_ids = [f.stem.split("-", 2)[:2] for f in t2_masks.glob("*nii.gz")]
case_ids = list(set([f"{c[0]}-{c[1]}" for c in case_ids]))
logger.info(f"Found {len(case_ids)} cases")
# save meta
logger.info("Saving dataset info")
dataset_info = {
"name": "ProstateX",
"task": "Task021_ProstateX",
"target_class": None,
"test_labels": False,
"labels": {
"0": "clinically_significant",
"1": "clinically_insignificant",
},
"modalities": {
"0": "T2",
"1": "ADC",
"2": "PD-W",
"3": "Ktrans"
},
"dim": 3,
"info": "Ground Truth: T2 Masks; \n"
"Modalities: T2, ADC, PD-W, Ktrans \n;"
"Classes: clinically significant = 1, insignificant = 0 \n"
"Keep: ProstateX-0025 '10-28-2011-MR prostaat kanker detectie WDSmc MCAPRODETW-19047'\n"
"Masks\n"
"https://github.com/rcuocolo/PROSTATEx_masks\n"
"Github hash: e3444521e70cd5e8d405f4e9a6bc08312df8afe7"
}
save_json(dataset_info, task_data_dir / "dataset.json")
# prepare labels and data
for cid in maybe_verbose_iterable(case_ids):
prepare_case(cid,
data_dirs=source_data,
ktrans_dirs=source_ktrans,
t2_masks=t2_masks,
df_labels=df_labels,
df_masks=df_masks,
data_target=data_target,
label_target=label_target,
)
# with Pool(processes=6) as p:
# p.starmap(prepare_case, zip(case_ids,
# repeat(source_data),
# repeat(source_ktrans),
# repeat(t2_masks),
# repeat(df_labels),
# repeat(df_masks),
# repeat(data_target),
# repeat(label_target),
# ))
# create test split
create_test_split(task_data_dir / "raw_splitted",
num_modalities=len(dataset_info["modalities"]),
test_size=0.3,
random_state=0,
shuffle=True,
)
if __name__ == '__main__':
main()
# LymphNodes
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://wiki.cancerimagingarchive.net/display/Public/CT+Lymph+Nodes
- Masks: we used the masks provided by the same page
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task025_LymphNodes`.
1. Download the data and labels and place the data into `Task025_LymphNodes / raw / CT Lymph Nodes` and the labels into `Task025_LymphNodes / raw / MED_ABD_LYMPH_MASKS`.
2. Run `python prepare.py` in `projects / Task025_LymphNodes / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
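
The script below locates each patient's DICOM folder and reads the series as a single volume. A minimal sketch of the SimpleITK series read it relies on (illustrative path):

```python
import SimpleITK as sitk

reader = sitk.ImageSeriesReader()
dicom_names = reader.GetGDCMSeriesFileNames("/path/to/dicom/folder")
reader.SetFileNames(dicom_names)
volume = reader.Execute()
```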
import os
import shutil
import sys
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path
from nndet.utils.check import env_guard
import numpy as np
from loguru import logger
import SimpleITK as sitk
from nndet.io import save_json
from nndet.io.prepare import create_test_split
from nndet.io.itk import load_sitk_as_array
from nndet.utils.info import maybe_verbose_iterable
def prepare_image(
case_id: str,
base_dir: Path,
mask_dir: Path,
raw_splitted_dir: Path,
):
logger.info(f"Processing {case_id}")
root_data_dir = base_dir / case_id
patient_data_dir = []
for root, dirs, files in os.walk(root_data_dir, topdown=False):
if any([f.endswith(".dcm") for f in files]):
patient_data_dir.append(Path(root))
assert len(patient_data_dir) == 1
patient_data_dir = patient_data_dir[0]
reader = sitk.ImageSeriesReader()
dicom_names = reader.GetGDCMSeriesFileNames(str(patient_data_dir))
reader.SetFileNames(dicom_names)
data_itk = reader.Execute()
patient_label_dir = mask_dir / case_id
label_path = [p for p in patient_label_dir.iterdir() if p.is_file() and p.name.endswith(".nii.gz")]
assert len(label_path) == 1
label_path = label_path[0]
mask = load_sitk_as_array(label_path)[0]
instances = np.unique(mask)
instances = instances[instances > 0]
meta = {"instances": {str(int(i)): 0 for i in instances}}
meta["original_path_data"] = str(patient_data_dir)
meta["original_path_label"] = str(label_path)
save_json(meta, raw_splitted_dir / "labelsTr" / f"{case_id}.json")
sitk.WriteImage(data_itk, str(raw_splitted_dir / "imagesTr" / f"{case_id}_0000.nii.gz"))
shutil.copy(label_path, raw_splitted_dir / "labelsTr" / f"{case_id}.nii.gz")
@env_guard
def main():
det_data_dir = Path(os.getenv("det_data"))
task_data_dir = det_data_dir / "Task025_LymphNodes"
source_data_base = task_data_dir / "raw"
if not source_data_base.is_dir():
raise RuntimeError(f"{source_data_base} should contain the raw data but does not exist.")
raw_splitted_dir = task_data_dir / "raw_splitted"
(raw_splitted_dir / "imagesTr").mkdir(parents=True, exist_ok=True)
(raw_splitted_dir / "labelsTr").mkdir(parents=True, exist_ok=True)
(raw_splitted_dir / "imagesTs").mkdir(parents=True, exist_ok=True)
(raw_splitted_dir / "labelsTs").mkdir(parents=True, exist_ok=True)
logger.remove()
logger.add(sys.stdout, format="{level} {message}", level="DEBUG")
logger.add(raw_splitted_dir.parent / "prepare.log", level="DEBUG")
meta = {
"name": "Lymph Node TCIA",
"task": "Task025_LymphNodes",
"target_class": None,
"test_labels": True,
"labels": {
"0": "LymphNode",
},
"modalities": {
"0": "CT",
},
"dim": 3,
}
save_json(meta, raw_splitted_dir.parent / "dataset.json")
base_dir = source_data_base / "CT Lymph Nodes"
mask_dir = source_data_base / "MED_ABD_LYMPH_MASKS"
case_ids = sorted([p.name for p in base_dir.iterdir() if p.is_dir()])
logger.info(f"Found {len(case_ids)} cases in {base_dir}")
for cid in maybe_verbose_iterable(case_ids):
prepare_image(
case_id=cid,
base_dir=base_dir,
mask_dir=mask_dir,
raw_splitted_dir=raw_splitted_dir,
)
# with Pool(processes=6) as p:
# p.starmap(
# prepare_image,
# zip(
# case_ids,
# repeat(base_dir),
# repeat(mask_dir),
# repeat(raw_splitted_dir)
# )
# )
create_test_split(raw_splitted_dir,
num_modalities=len(meta["modalities"]),
test_size=0.3,
random_state=0,
shuffle=True,
)
if __name__ == '__main__':
main()
"""
Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse
import shutil
import sys
import os
from pathlib import Path
from typing import Sequence
from loguru import logger
from nndet.utils.check import env_guard
from omegaconf import OmegaConf
from nndet.ptmodule import MODULE_REGISTRY
from nndet.inference.sweeper import BoxSweeper
from nndet.inference.loading import get_latest_model
from nndet.inference.ensembler.base import extract_results
from nndet.io import get_task, load_pickle, save_pickle
def consolidate_models(source_dirs: Sequence[Path], target_dir: Path, ckpt: str):
"""
Copy final models from folds into consolidated folder
Args:
source_dirs: directory of each fold to consolidate
target_dir: directory to save models to
ckpt: checkpoint identifier to select models for ensembling
"""
for fold, sd in enumerate(source_dirs):
model_paths = list(sd.glob('*.ckpt'))
found_models = [mp for mp in model_paths if ckpt in str(mp.stem)]
assert len(found_models) == 1, f"Found wrong number of models, {found_models}"
model_path = found_models[0]
assert f"fold{fold}" in str(model_path.parent.stem), f"Expected fold {fold} but found {model_path}"
shutil.copy2(model_path, target_dir / f"model_fold{fold}.ckpt")
def consolidate_predictions(
source_dirs: Sequence[Path],
target_dir: Path,
consolidate: str,
):
"""
Consolidate sweep states to find new postprocessing hyperparameters
Args:
source_dirs: directory of each fold
        target_dir: directory of consolidated models
consolidate: consolidation mode
"""
if consolidate == 'export':
logger.info("Consolidating sweep states for refinement.")
postfix = "sweep_predictions"
elif consolidate == 'copy':
logger.info("Consolidating val predictions for evaluation")
postfix = "val_predictions"
else:
raise ValueError(f"Consolidation {consolidate} is not supported")
pred_dir = target_dir / postfix
pred_dir.mkdir(parents=True, exist_ok=True)
for source_dir in source_dirs:
for p in [p for p in (source_dir / postfix).iterdir() if p.is_file()]:
shutil.copy(p, pred_dir)
@env_guard
def main():
parser = argparse.ArgumentParser()
parser.add_argument('task', type=str,
help="Task id e.g. Task12_LIDC OR 12 OR LIDC",
)
parser.add_argument('model', type=str,
help="model name, e.g. RetinaUNetV0",
)
parser.add_argument('-o', '--overwrites', type=str, nargs='+', required=False,
help="overwrites for config file. Only needed in case of box eval",
)
parser.add_argument('-c', '--consolidate', type=str, default="export", required=False,
help=("Determines how to consolidate predictions: 'export' or 'copy'. "
"'copy' will copy the predictions of each fold into the directory for evaluation. "
"'export' will use the updated parameters after consolidation to update the "
"predictions and export them. This is only supported if one of the "
"sweep settings is active! Default: export"),
)
parser.add_argument('--num_folds', type=int, default=5, required=False,
help="Number of folds. Default: 5",
)
parser.add_argument('--no_model', action="store_false",
help="Deactivate if consolidating nnUNet results",
)
parser.add_argument('--sweep_boxes', action="store_true",
help="Sweep for best parameters for bounding box based models",
)
parser.add_argument('--sweep_instances', action="store_true",
help="Sweep for best parameters for instance segmentation based models",
)
parser.add_argument('--ckpt', type=str, default="last", required=False,
help="Define identifier of checkpoint for consolidation. "
"Use this with care!")
args = parser.parse_args()
model = args.model
task = args.task
ov = args.overwrites
consolidate = args.consolidate
num_folds = args.num_folds
do_model_consolidation = args.no_model
sweep_boxes = args.sweep_boxes
sweep_instances = args.sweep_instances
ckpt = args.ckpt
if consolidate == "export" and not (sweep_boxes or sweep_instances):
raise ValueError("Export needs new parameter sweep! Actiate one of the sweep "
"arguments or change to copy mode")
task_dir = Path(os.getenv("det_models")) / get_task(task, name=True, models=True)
model_dir = task_dir / model
if not model_dir.is_dir():
raise ValueError(f"{model_dir} does not exist")
target_dir = model_dir / "consolidated"
logger.remove()
logger.add(sys.stdout, format="{level} {message}", level="INFO")
logger.add(Path(target_dir) / "consolidate.log", level="DEBUG")
logger.info(f"looking for models in {model_dir}")
training_dirs = [get_latest_model(model_dir, fold) for fold in range(num_folds)]
logger.info(f"Found training dirs: {training_dirs}")
# model consolidation
if do_model_consolidation:
logger.info("Consolidate models")
if ckpt != "last":
logger.warning(f"Found ckpt overwrite {ckpt}, this is not the default, "
"this can drastically influence the performance!")
consolidate_models(training_dirs, target_dir, ckpt)
# consolidate predictions
logger.info("Consolidate predictions")
consolidate_predictions(
source_dirs=training_dirs,
target_dir=target_dir,
consolidate=consolidate,
)
shutil.copy2(training_dirs[0] / "plan.pkl", target_dir)
shutil.copy2(training_dirs[0] / "config.yaml", target_dir)
# invoke new parameter sweeps
cfg = OmegaConf.load(str(target_dir / "config.yaml"))
ov = ov if ov is not None else []
ov.append("host.parent_data=${env:det_data}")
ov.append("host.parent_results=${env:det_models}")
if ov is not None:
cfg.merge_with_dotlist(ov)
preprocessed_output_dir = Path(cfg["host"]["preprocessed_output_dir"])
plan = load_pickle(target_dir / "plan.pkl")
gt_dir = preprocessed_output_dir / plan["data_identifier"] / "labelsTr"
if sweep_boxes:
logger.info("Sweeping box predictions")
module = MODULE_REGISTRY[cfg["module"]]
ensembler_cls = module.get_ensembler_cls(
key="boxes", dim=plan["network_dim"]) # TODO: make this configurable
sweeper = BoxSweeper(
classes=[item for _, item in cfg["data"]["labels"].items()],
pred_dir=target_dir / "sweep_predictions",
gt_dir=gt_dir,
target_metric=cfg["trainer_cfg"].get("eval_score_key",
"mAP_IoU_0.10_0.50_0.05_MaxDet_100"),
ensembler_cls=ensembler_cls,
save_dir=target_dir / "sweep",
)
inference_plan = sweeper.run_postprocessing_sweep()
elif sweep_instances:
raise NotImplementedError
plan = load_pickle(target_dir / "plan.pkl")
if consolidate != 'copy':
plan["inference_plan"] = inference_plan
save_pickle(plan, target_dir / "plan_inference.pkl")
for restore in [True, False]:
export_dir = target_dir / "val_predictions" if restore else \
target_dir / "val_predictions_preprocessed"
extract_results(
source_dir=target_dir / "sweep_predictions",
target_dir=export_dir,
ensembler_cls=ensembler_cls,
restore=restore,
**inference_plan,
)
else:
logger.warning("Plan used from fold 0, not updated with consolidation")
save_pickle(plan, target_dir / "plan_inference.pkl")
if __name__ == '__main__':
main()
"""
Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse
import shutil
import sys
from pathlib import Path
from hydra.experimental import initialize_config_module
from loguru import logger
from nndet.io import get_task, load_json, save_json
from nndet.utils.config import compose, load_dataset_info
from nndet.utils.check import env_guard
def convert_raw(task, overwrite, ov):
task_name_full = get_task(task, name=True)
task_num, task_name = task_name_full[4:].split('_', 1)
new_task_name_full = f"Task{task_num}FG_{task_name}"
cfg = compose(task, "config.yaml", overrides=ov if ov is not None else [])
print(cfg.pretty())
source_splitted_dir = Path(cfg["host"]["splitted_4d_output_dir"])
target_splitted_dir = Path(str(source_splitted_dir).replace(task_name_full, new_task_name_full))
if target_splitted_dir.is_dir() and overwrite:
shutil.rmtree(target_splitted_dir)
target_splitted_dir.mkdir(parents=True)
logger.remove()
logger.add(sys.stdout, level="INFO")
logger.add(target_splitted_dir.parent / "convert_cls2fg.log", level="DEBUG")
# update dataset_info
source_data_info = Path(cfg["host"]["data_dir"])
data_info = load_dataset_info(source_data_info)
data_info.pop("labels")
data_info["labels"] = {"0": "fg"}
data_info["task"] = new_task_name_full
save_json(data_info, target_splitted_dir.parent / "dataset.json", indent=4)
for postfix in ["Tr", "Ts"]:
source_image_dir = source_splitted_dir / f"images{postfix}"
source_label_dir = source_splitted_dir / f"labels{postfix}"
if not source_image_dir.is_dir():
logger.info(f"{source_image_dir} is not a dir. Skipping it.")
continue
# copy images and labels
shutil.copytree(source_image_dir, target_splitted_dir / f"images{postfix}")
shutil.copytree(source_label_dir, target_splitted_dir / f"labels{postfix}")
# remap properties file to foreground class
target_label_dir = target_splitted_dir / f"labels{postfix}"
for f in [l for l in target_label_dir.glob("*.json")]:
props = load_json(f)
props["instances"] = {key: 0 for key in props["instances"].keys()}
save_json(props, f)
@env_guard
def main():
"""
Convert raw splitted data with class sensitive annotations into
a new dataset which only distinguishes fg and bg
"""
parser = argparse.ArgumentParser()
parser.add_argument('tasks', type=str, nargs='+',
help="Single or multiple task identifiers to process consecutively",
)
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('-o', '--overwrites', type=str, nargs='+',
help="overwrites for config file",
required=False)
args = parser.parse_args()
tasks = args.tasks
ov = args.overwrites
overwrite = args.overwrite
initialize_config_module(config_module="nndet.conf")
for task in tasks:
convert_raw(task, overwrite, ov)
if __name__ == '__main__':
main()
"""
Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse
import sys
from datetime import datetime
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path
from typing import Sequence
import numpy as np
import SimpleITK as sitk
from hydra.experimental import initialize_config_module
from loguru import logger
from scipy import ndimage
from scipy.ndimage import label
from tqdm import tqdm
from nndet.core.boxes import box_size_np
from nndet.io import get_case_ids_from_dir, load_json, save_json
from nndet.io.transforms.instances import get_bbox_np
from nndet.io.itk import copy_meta_data_itk, load_sitk, load_sitk_as_array
from nndet.utils.config import compose
def prepare_detection_label(case_id: str,
label_dir: Path,
things_classes: Sequence[int],
stuff_classes: Sequence[int],
min_size: float = 0,
min_vol: float = 0,
):
if (label_dir / f"{case_id}.json").is_file():
logger.info(f"Found existing case {case_id} -> skipping")
return
logger.info(f"Processing {case_id}")
seg_itk = load_sitk(label_dir / f"{case_id}.nii.gz")
spacing = np.asarray(seg_itk.GetSpacing())[::-1]
seg = sitk.GetArrayFromImage(seg_itk)
# prepare stuff information
stuff_seg = np.zeros_like(seg)
if stuff_classes:
for new_class, old_class in enumerate(stuff_classes, start=1):
stuff_seg[seg == old_class] = new_class
stuff_seg_itk = copy_meta_data_itk(seg_itk, sitk.GetImageFromArray(stuff_seg))
sitk.WriteImage(stuff_seg_itk, str(label_dir / f"{case_id}_stuff.nii.gz"))
# prepare things information
structure = np.ones([3] * seg.ndim)
things_seg = np.copy(seg)
things_seg[stuff_seg > 0] = 0 # remove all stuff classes from segmentation
instances_not_filtered, _ = label(things_seg, structure=structure)
final_mapping = {}
if instances_not_filtered.max() > 0:
boxes = get_bbox_np(instances_not_filtered[None])["boxes"]
box_sizes = box_size_np(boxes)
instance_ids = np.unique(instances_not_filtered)
instance_ids = instance_ids[instance_ids > 0]
assert len(instance_ids) == len(boxes)
        isotropic_axes = list(range(seg.ndim))
        isotropic_axes.pop(np.argmax(spacing))
instances = np.zeros_like(instances_not_filtered)
start_id = 1
for iid, bsize in zip(instance_ids, box_sizes):
bsize_world = bsize * spacing
instance_mask = (instances_not_filtered == iid)
instance_vol = instance_mask.sum()
            if all(bsize_world[isotropic_axes] > min_size) and (instance_vol > min_vol):
instances[instance_mask] = start_id
single_idx = np.argwhere(instance_mask)[0]
semantic_class = int(seg[tuple(single_idx)])
final_mapping[start_id] = things_classes.index(semantic_class)
start_id += 1
else:
instances = np.zeros_like(instances_not_filtered)
final_instances_itk = copy_meta_data_itk(seg_itk, sitk.GetImageFromArray(instances))
sitk.WriteImage(final_instances_itk, str(label_dir / f"{case_id}.nii.gz"))
save_json({"instances": final_mapping}, label_dir / f"{case_id}.json")
sitk.WriteImage(seg_itk, str(label_dir / f"{case_id}_orig.nii.gz"))
if __name__ == '__main__':
"""
This script converts a semantic segmentation dataset into an instance
segmentation dataset by using connected components on the labels.
    To account for stray, disconnected pixels inside the annotations, only
    connected components above a specified minimal size are converted into objects.
The data needs to be in the same format as in nnunet: images
stay the same, labels will be semantic segmentations.
============================================================================
================================IMPORTANT==================================+
============================================================================
Needs additional information from dataset.json/.yaml:
`seg2det_stuff`: these are classes which are interpreted semantically
`seg2det_things`: these are classes which are interpreted as instances
Both entries should be lists with the indices of the respective
classes where the position will determine its new class
e.g.
`seg2det_stuff`: [2,] -> remap class 2 from semantic segmentation
to new stuff class 1 (stuff classes start at one)
`seg2det_things`: [1, 3] -> remap class 1 and 3 from semantic
segmentation to new things classes 0 and 1, respectively
`min_size`: minimum size in mm of objects in the isotropic axis (default 0)
`min_vol`: minimum volume of instances in pixels (default 0)
============================================================================
    The segmentation labels will be split into things classes (classes to
    detect) and stuff classes (additional segmentation labels), which are
    saved as separate files.
"""
parser = argparse.ArgumentParser()
parser.add_argument('tasks', type=str, nargs='+',
help="Single or multiple task identifiers to process consecutively",
)
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('-o', '--overwrites', type=str, nargs='+',
help="overwrites for config file",
required=False,
)
parser.add_argument('--volume_ranking',
help="Create a ranking of instances based on their volume",
action='store_true',
)
parser.add_argument('--num_processes', type=int, default=4, required=False,
help="Number of processes to use for conversion.")
args = parser.parse_args()
tasks = args.tasks
ov = args.overwrites
overwrite = args.overwrite
do_volume_ranking = args.volume_ranking
num_processes = args.num_processes
initialize_config_module(config_module="nndet.conf")
for task in tasks:
cfg = compose(task, "config.yaml", overrides=ov if ov is not None else [])
print(cfg.pretty())
splitted_dir = Path(cfg["host"]["splitted_4d_output_dir"])
logger.remove()
logger.add(sys.stdout, level="INFO")
logger.add(splitted_dir / "convert_seg2det.log", level="DEBUG")
logger.info(f"+++++ Running covnersion: {datetime.now()} +++++")
logger.info(f"Running min_size {cfg['data'].get('min_size', 0)} and "
f"min_vol {cfg['data'].get('min_vol', 0)}")
for postfix in ["Tr", "Ts"]:
label_dir = splitted_dir / f"labels{postfix}"
case_ids = [f.name[:-7] for f in label_dir.glob("*.nii.gz")]
logger.info(f"Found {len(case_ids)} cases for conversion with postfix {postfix}.")
# for cid in case_ids:
# prepare_detection_label(case_id=cid,
# label_dir=label_dir,
# stuff_classes=cfg["data"]["seg2det_stuff"],
# things_classes=cfg["data"]["seg2det_things"],
# min_size=cfg["data"].get("min_size", 0),
# min_vol=cfg["data"].get("min_vol", 0),
# )
with Pool(processes=num_processes) as p:
p.starmap(prepare_detection_label, zip(
case_ids,
repeat(label_dir),
repeat(cfg["data"]["seg2det_things"]),
repeat(cfg["data"]["seg2det_stuff"]),
repeat(cfg["data"].get("min_size", 0)),
repeat(cfg["data"].get("min_vol", 0)),
))
if do_volume_ranking:
for postfix in ["Tr", "Ts"]:
if (label_dir := splitted_dir / f"labels{postfix}").is_dir():
ranking = []
for case_id in tqdm([f.stem for f in label_dir.glob("*.json")]):
instances = load_sitk_as_array(label_dir / f"{case_id}.nii.gz")[0]
instance_ids, instance_counts = np.unique(instances, return_counts=True)
cps = [np.argwhere(instances == iid)[0].tolist() for iid in instance_ids[1:]]
assert len(instance_ids) - 1 == len(cps)
tmp = [{"case_id": str(case_id), "instance_id": int(iid),
"vol": int(vol), "cp": list(cp)[::-1]}
for iid, vol, cp in zip(instance_ids[1:], instance_counts[1:], cps)]
ranking.extend(tmp)
ranking = sorted(ranking, key=lambda x: x["vol"])
save_json(ranking, splitted_dir / f"volume_ranking_{postfix}.json")
else:
logger.info(f"Did not find dir {label_dir} for volume ranking")