Commit 95642b36 authored by mibaumgartner

LIDC, Luna, Decathlon Guide

parent a3d8812a
@@ -152,7 +152,7 @@ Furthermore, we provide pretrained models which can be used without investing la
| <!-- --> | <!-- --> | <!-- --> |
|:--------:|:--------:|:--------:|
| | [nnDetection v0.1](/docs/results/nnDetectionV001.md) | |
</div>
# Decathlon
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: http://medicaldecathlon.com/
## Setup
0. Follow the installation instructions of nnDetection and create the data directories for the intended tasks, e.g. `Task003_Liver`.
1. Follow the instructions and usage policies to download the data and place the images, labels, and `dataset.json` file inside the raw folder of the respective task, e.g. imagesTr -> `Task003_Liver / raw / imagesTr`, labelsTr -> `Task003_Liver / raw / labelsTr` and dataset.json -> `Task003_Liver / raw / dataset.json` (a small layout check is sketched below).
2. Run `python prepare.py [tasks]` in `projects / Task001_Decathlon / scripts` of the nnDetection repository, e.g. to prepare all tasks: `python prepare.py Task003_Liver Task007_Pancreas Task008_HepaticVessel Task010_Colon`
3. Run `nndet_seg2det [tasks]` to convert the semantic segmentation labels to instance segmentations, e.g. to convert all tasks: `nndet_seg2det 003 007 008 010`
4. Run ... to download and replace the manually corrected labels. # TODO
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
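Before running `prepare.py`, the raw folder of each task should match the layout from step 1. A minimal sketch of such a check, using `Task003_Liver` as an example and assuming the `det_data` environment variable is set as required by nnDetection:

```python
import os
from pathlib import Path

# Check the layout described in step 1 for one task (Task003_Liver as an example).
raw = Path(os.getenv("det_data")) / "Task003_Liver" / "raw"
for sub in ("imagesTr", "labelsTr"):
    assert (raw / sub).is_dir(), f"missing directory: {raw / sub}"
assert (raw / "dataset.json").is_file(), f"missing file: {raw / 'dataset.json'}"
print(sorted(p.name for p in (raw / "imagesTr").glob("*.nii.gz"))[:3])
```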
import argparse
import os
import shutil
import sys
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path

from loguru import logger

from nndet.io.load import save_json
from nndet.io.prepare import maybe_split_4d_nifti, create_test_split
from nndet.io import get_case_ids_from_dir, load_json, save_yaml
from nndet.utils.check import env_guard


def process_case(case_id,
                 source_images,
                 source_labels,
                 target_images,
                 target_labels,
                 ):
    logger.info(f"Processing case {case_id}")
    maybe_split_4d_nifti(source_images / f"{case_id}.nii.gz", target_images)
    shutil.copy2(source_labels / f"{case_id}.nii.gz", target_labels)


@env_guard
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('tasks', type=str, nargs='+',
                        help="One or multiple of: Task003_Liver, Task007_Pancreas, "
                             "Task008_HepaticVessel, Task010_Colon",
                        )
    args = parser.parse_args()
    tasks = args.tasks

    # per-task settings for the semantic-to-instance conversion and the final label maps
    decathlon_props = {
        "Task003_Liver": {
            "seg2det_stuff": [1, ],  # liver
            "seg2det_things": [2, ],  # cancer
            "min_size": 3.,
            "labels": {"0": "cancer"},
            "labels_stuff": {"1": "liver"},
        },
        "Task007_Pancreas": {
            "seg2det_stuff": [1, ],  # pancreas
            "seg2det_things": [2, ],
            "min_size": 3.,
            "labels": {"0": "cancer"},
            "labels_stuff": {"1": "pancreas"},
        },
        "Task008_HepaticVessel": {
            "seg2det_stuff": [1, ],  # vessel
            "seg2det_things": [2, ],
            "min_size": 3.,
            "labels": {"0": "tumour"},
            "labels_stuff": {"1": "vessel"},
        },
        "Task010_Colon": {
            "seg2det_stuff": [],
            "seg2det_things": [1, ],
            "min_size": 3.,
            "labels": {"0": "cancer"},
            "labels_stuff": {},
        },
    }

    basedir = Path(os.getenv('det_data'))
    for task in tasks:
        task_data_dir = basedir / task

        logger.remove()
        logger.add(sys.stdout, level="INFO")
        logger.add(task_data_dir / "prepare.log", level="DEBUG")
        logger.info(f"Preparing task: {task}")

        source_raw_dir = task_data_dir / "raw"
        source_data_dir = source_raw_dir / "imagesTr"
        source_labels_dir = source_raw_dir / "labelsTr"
        splitted_dir = task_data_dir / "raw_splitted"

        if not source_data_dir.is_dir():
            raise ValueError(f"Expected training images at {source_data_dir}")
        if not source_labels_dir.is_dir():
            raise ValueError(f"Expected training labels at {source_labels_dir}")
        if not (p := source_raw_dir / "dataset.json").is_file():
            raise ValueError(f"Expected dataset json to be located at {p}")

        target_data_dir = splitted_dir / "imagesTr"
        target_label_dir = splitted_dir / "labelsTr"
        target_data_dir.mkdir(parents=True, exist_ok=True)
        target_label_dir.mkdir(parents=True, exist_ok=True)

        # prepare meta
        original_meta = load_json(source_raw_dir / "dataset.json")
        dataset_info = {
            "task": task,
            "name": original_meta["name"],
            "target_class": None,
            "test_labels": True,
            "modalities": original_meta["modality"],
            "dim": 3,
            "info": {
                "original_labels": original_meta["labels"],
                "original_numTraining": original_meta["numTraining"],
            },
        }
        dataset_info.update(decathlon_props[task])
        save_json(dataset_info, task_data_dir / "dataset.json")

        # prepare data and labels
        case_ids = get_case_ids_from_dir(source_data_dir, remove_modality=False)
        case_ids = sorted([c for c in case_ids if c])
        logger.info(f"Found {len(case_ids)} cases for preparation.")

        for cid in case_ids:
            process_case(cid,
                         source_data_dir,
                         source_labels_dir,
                         target_data_dir,
                         target_label_dir,
                         )
        # with Pool(processes=6) as p:
        #     p.starmap(process_case, zip(case_ids,
        #                                 repeat(source_data_dir),
        #                                 repeat(source_labels_dir),
        #                                 repeat(target_data_dir),
        #                                 repeat(target_label_dir),
        #                                 ))

        # create an artificial test split
        create_test_split(splitted_dir=splitted_dir,
                          num_modalities=1,
                          test_size=0.3,
                          random_state=0,
                          shuffle=True,
                          )


if __name__ == '__main__':
    main()
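The `seg2det_stuff` / `seg2det_things` entries above steer the `nndet_seg2det` conversion: classes listed under `seg2det_things` (e.g. the tumours) are split into connected components that become individual detection instances, while `seg2det_stuff` classes (e.g. liver) remain semantic targets (`labels_stuff`). The snippet below only illustrates the connected-component idea with `scipy.ndimage.label`; it is not the repository's implementation:

```python
import numpy as np
from scipy import ndimage

# Toy semantic mask: 1 = liver ("stuff"), 2 = cancer ("things"), mirroring Task003_Liver above.
sem = np.zeros((1, 10, 10), dtype=np.uint8)
sem[0, 1:8, 1:8] = 1      # liver
sem[0, 2:4, 2:4] = 2      # first lesion
sem[0, 5:7, 5:7] = 2      # second lesion

# "Things" voxels are split into connected components that become detection instances.
instances, num_instances = ndimage.label(sem == 2)
print(num_instances)                                     # 2 lesions -> 2 instances
print({str(i): 0 for i in range(1, num_instances + 1)})  # all instances map to class 0 ("cancer")
```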
@@ -10,6 +10,6 @@ Please read the information from the homepage carefully and follow the rules and
1. Follow the instructions and usage policies to download the data and place all the folders which contain the data and labels for each case into `Task011_Kits / raw`
2. Run `python prepare.py` in `projects / Task011_Kits / scripts` of the nnDetection repository.
3. Run `nndet_seg2det 011` to convert the semantic segmentation labels to instance segmentations.
4. Run ... to download and replace the manually corrected labels. # TODO
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
# LIDC
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://wiki.cancerimagingarchive.net/display/Public/LIDC-IDRI
## Setup via MIC LIDC Data Preprocessing
0. Follow https://github.com/MIC-DKFZ/LIDC-IDRI-processing to convert the LIDC data into a simpler format.
1. Follow the installation instructions of nnDetection and create a data directory named `Task012_LIDC`.
2. Place the `data_nrrd` folder and `characteristics.csv` into `Task012_LIDC / raw`
3. Run `python prepare_mic.py` in `projects / Task012_LIDC / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
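`prepare_mic.py` (shown further below) fuses the up-to-four rater segmentations per nodule by voxel-wise majority vote and maps the mean malignancy rating (1 to 5 scale) to a binary instance class. A minimal sketch of that rating rule, with illustrative values:

```python
import numpy as np

# Malignancy ratings (1-5) of the raters who segmented one nodule; missing raters
# are padded with 0, and all values > -1 enter the mean (mirrors prepare_case below).
rater_labels = [3, 4]
rater_labels.extend([0] * (4 - len(rater_labels)))
mal_score = np.mean([r for r in rater_labels if r > -1])  # (3 + 4 + 0 + 0) / 4 = 1.75
instance_class = int(mal_score >= 3)                      # 1 = malignant (mean rating >= 3), else 0
print(mal_score, instance_class)                          # 1.75 0
```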
## Setup PyLIDC
**Coming Soon**
import sys
import os
from itertools import repeat
from multiprocessing.pool import Pool
from pathlib import Path

import pandas as pd
import numpy as np
import numpy.testing as npt
import SimpleITK as sitk
from loguru import logger
from tqdm import tqdm

from nndet.io.load import save_json, load_json
from nndet.io.paths import subfiles
from nndet.utils.check import env_guard


def prepare_case(case_dir: Path, target_dir: Path, df: pd.DataFrame):
    target_data_dir = target_dir / "imagesTr"
    target_label_dir = target_dir / "labelsTr"
    case_id = str(case_dir).split('/')[-1]
    logger.info(f"Processing case {case_id}")
    df = df[df.PatientID == case_id]

    # process data
    img = sitk.ReadImage(str(case_dir / f"{case_id}_ct_scan.nrrd"))
    sitk.WriteImage(img, str(target_data_dir / f"{case_id}.nii.gz"))
    img_arr = sitk.GetArrayFromImage(img)

    # process mask
    final_rois = np.zeros_like(img_arr, dtype=np.uint8)
    mal_labels = {}
    roi_ids = set([ii.split('.')[0].split('_')[-1]
                   for ii in os.listdir(case_dir) if '.nii.gz' in ii])

    rix = 1
    for rid in roi_ids:
        roi_id_paths = [ii for ii in os.listdir(case_dir) if '{}.nii'.format(rid) in ii]
        nodule_ids = [ii.split('_')[2].lstrip("0") for ii in roi_id_paths]
        # malignancy ratings of the individual raters; missing raters are padded with 0
        rater_labels = [df[df.NoduleID == int(ii)].Malignancy.values[0] for ii in nodule_ids]
        rater_labels.extend([0] * (4 - len(rater_labels)))
        mal_label = np.mean([ii for ii in rater_labels if ii > -1])

        roi_rater_list = []
        for rp in roi_id_paths:
            roi = sitk.ReadImage(str(case_dir / rp))
            roi_arr = sitk.GetArrayFromImage(roi).astype(np.uint8)
            assert roi_arr.shape == img_arr.shape, [
                roi_arr.shape, img_arr.shape, case_id, roi.GetSpacing()]
            for ix in range(len(img_arr.shape)):
                npt.assert_almost_equal(roi.GetSpacing()[ix], img.GetSpacing()[ix])
            roi_rater_list.append(roi_arr)
        roi_rater_list.extend([np.zeros_like(roi_rater_list[-1])] * (4 - len(roi_id_paths)))

        # voxel-wise majority vote over the (up to four) rater masks
        roi_raters = np.array(roi_rater_list)
        roi_raters = np.mean(roi_raters, axis=0)
        roi_raters[roi_raters < 0.5] = 0
        if np.sum(roi_raters) > 0:
            mal_labels[rix] = mal_label
            final_rois[roi_raters >= 0.5] = rix
            rix += 1
        else:
            # indicate rois suppressed by majority voting of raters
            logger.warning(f'suppressed roi! {roi_id_paths}')

    mask_itk = sitk.GetImageFromArray(final_rois)
    sitk.WriteImage(mask_itk, str(target_label_dir / f"{case_id}.nii.gz"))

    instance_classes = {key: int(item >= 3) for key, item in mal_labels.items()}
    save_json({"instances": instance_classes, "scores": mal_labels},
              target_label_dir / f"{case_id}")


def reformat_labels(target: Path):
    for p in subfiles(target, identifier="*json", join=True):
        label = load_json(Path(p))
        mal_labels = label["scores"]
        instance_classes = {key: int(item >= 3) for key, item in mal_labels.items()}
        save_json({"instances": instance_classes, "scores": mal_labels}, Path(p))


def delete_without_label(target: Path):
    for p in subfiles(target, identifier="*.npz", join=True):
        _p = str(p).rsplit('.', 1)[0] + '.pkl'
        if not os.path.isfile(_p):
            os.remove(p)


def check_data_load(target: Path):
    for p in tqdm(subfiles(target, identifier="*.npy", join=True)):
        try:
            data = np.load(p)
        except Exception as e:
            print(f"Failed to load: {p} with {e}")


@env_guard
def main():
    det_data_dir = Path(os.getenv('det_data'))
    task_data_dir = det_data_dir / "Task012_LIDC"
    source_data_dir = task_data_dir / "raw"

    if not (p := source_data_dir / "data_nrrd").is_dir():
        raise ValueError(f"Expected {p} to contain the LIDC data")
    if not (p := source_data_dir / 'characteristics.csv').is_file():
        raise ValueError(f"Expected {p} to exist")

    target_dir = task_data_dir / "raw_splitted"
    target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
    target_data_dir.mkdir(exist_ok=True, parents=True)
    target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
    target_label_dir.mkdir(exist_ok=True, parents=True)

    logger.remove()
    logger.add(sys.stdout, level="INFO")
    logger.add(task_data_dir / "prepare.log", level="DEBUG")

    data_dir = source_data_dir / "data_nrrd"
    case_dirs = [x for x in data_dir.iterdir() if x.is_dir()]
    df = pd.read_csv(source_data_dir / 'characteristics.csv', sep=';')
    for cd in case_dirs:
        prepare_case(cd, target_dir, df)

    # TODO download custom split file


if __name__ == '__main__':
    main()
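For reference, the per-case label JSON written by `prepare_case` above pairs every instance id in the mask with a binary class (1 if the mean malignancy rating is at least 3) and keeps the raw mean rating as a score. The values below are made up, purely for illustration:

```python
# Hypothetical content of raw_splitted/labelsTr/<case_id>.json (illustrative values only):
example_label = {
    "instances": {"1": 0, "2": 1},   # instance id -> binary class
    "scores": {"1": 2.5, "2": 4.0},  # instance id -> mean malignancy rating
}
```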
# Luna16
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://luna16.grand-challenge.org/Home/
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task016_Luna`.
1. Follow the instructions and usage policies to download the data and place all the subsets into `Task016_Luna / raw`
2. Run `python prepare.py` in `projects / Task016_Luna / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
Notes:
- Since Luna is evaluated with a 10-fold cross-validation, all 10 folds need to be run.
- All runs should use the `--sweep` option, and consolidation should be performed with `--no_model -c copy` since no separate test set will be predicted.
## Evaluation
1. Run `python prepare_eval_cpm.py [model_name]` to convert the predictions to the Luna format (a sketch of the coordinate conversion follows below).
Note: The script needs access to the raw_splitted images.
2. Download and run the Luna evaluation script.
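`prepare_eval_cpm.py` (included below) converts each predicted box center from voxel indices to world coordinates before writing the `seriesuid, coordX, coordY, coordZ, probability` CSV; the centers come back in array (z, y, x) order and have to be flipped to (x, y, z) for SimpleITK. A minimal sketch of that conversion with a toy image (spacing, origin, and the center value are illustrative):

```python
import SimpleITK as sitk

# Toy image standing in for a raw_splitted CT (spacing/origin are illustrative).
image_itk = sitk.Image(512, 512, 128, sitk.sitkInt16)
image_itk.SetSpacing((0.7, 0.7, 1.25))
image_itk.SetOrigin((-180.0, -180.0, -300.0))

# One predicted box center in array (z, y, x) voxel order.
center_zyx = (42.0, 180.5, 210.0)

# SimpleITK expects continuous indices in (x, y, z) order before mapping to world coordinates.
index_xyz = (center_zyx[2], center_zyx[1], center_zyx[0])
coord_x, coord_y, coord_z = image_itk.TransformContinuousIndexToPhysicalPoint(index_xyz)
print(coord_x, coord_y, coord_z)   # values for coordX/coordY/coordZ in the Luna CSV
```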
import os
import sys
import traceback
from collections import defaultdict
from itertools import repeat
from multiprocessing.pool import Pool
from pathlib import Path

import pandas as pd
import SimpleITK as sitk
from loguru import logger

from nndet.io.prepare import create_test_split
from nndet.io.itk import create_circle_mask_itk
from nndet.io.load import save_pickle, save_json, save_yaml, load_json
from nndet.utils.check import env_guard


def create_masks(source: Path, target: Path, df: pd.DataFrame):
    files = []
    split = {}
    # map each case id to its official Luna subset for later fold generation
    for i in range(10):
        subset_dir = source / f"subset{i}"
        if not subset_dir.is_dir():
            logger.error(f"{subset_dir} is not a valid subset directory!")
            continue
        tmp = list(subset_dir.glob('*.mhd'))
        files.extend(tmp)
        for t in tmp:
            split[t.stem.replace('.', '_')] = i
    save_json(split, target.parent.parent / "splits.json")

    # collect annotated nodule centers and radii (world coordinates) per case
    centers = []
    rads = []
    for f in files:
        c = []
        r = []
        try:
            series_df = df.loc[[f.name.rsplit('.', 1)[0]]]
        except KeyError:
            pass
        else:
            for _, row in series_df.iterrows():
                c.append((float(row['coordX']), float(row['coordY']), float(row['coordZ'])))
                r.append(float(row['diameter_mm']) / 2)
        centers.append(c)
        rads.append(r)
    assert len(files) == len(centers) == len(rads)

    with Pool(processes=6) as p:
        p.starmap(_create_mask, zip(files, repeat(target), centers, rads))
    # for t in zip(files, repeat(target), centers, rads):
    #     _create_mask(*t)


def _create_mask(source, target, centers, rads):
    try:
        logger.info(f"Processing {source.stem}")
        data = sitk.ReadImage(str(source))
        mask = create_circle_mask_itk(data, centers, rads, ndim=3)
        sitk.WriteImage(mask, str(target / f"{source.stem.replace('.', '_')}.nii.gz"))
        save_json({"instances": {str(k + 1): 0 for k in range(len(centers))}},
                  target / f"{source.stem.replace('.', '_')}.json")
    except Exception as e:
        logger.error(f"Case {source.stem} failed with {e} and {traceback.format_exc()}")


def create_splits(source, target):
    files = []
    for p in source.glob('subset*'):
        path = Path(p)
        if not p.is_dir():
            continue
        _files = [str(i).rsplit('.', 1)[0] for i in path.iterdir() if i.suffix == ".mhd"]
        files.append(_files)

    splits = []
    for i in range(len(files)):
        train_ids = list(range(len(files)))
        test = files[i]
        val_idx = (i + 1) % len(files)
        val = files[val_idx]
        # remove by value (not positional pop) so both held-out folds are excluded from training
        train_ids.remove(i)
        train_ids.remove(val_idx)
        assert len(train_ids) == len(files) - 2
        train = [tr for tri in train_ids for tr in files[tri]]
        splits.append({"train": train, "val": val, "test": test})
    save_pickle(splits, target)


def convert_data(source: Path, target: Path):
    for subset_dir in source.glob('subset*'):
        subset_dir = Path(subset_dir)
        if not subset_dir.is_dir():
            continue
        with Pool(processes=6) as p:
            p.starmap(_convert_data, zip(subset_dir.glob('*.mhd'), repeat(target)))


def _convert_data(f, target):
    logger.info(f"Converting {f}")
    try:
        data = sitk.ReadImage(str(f))
        sitk.WriteImage(data, str(target / f"{f.stem.replace('.', '_')}_0000.nii.gz"))
    except Exception as e:
        logger.error(f"Case {f} failed with {e} and {traceback.format_exc()}")


@env_guard
def main():
    det_data_dir = Path(os.getenv('det_data'))
    task_data_dir = det_data_dir / "Task016_Luna"
    source_data_dir = task_data_dir / "raw"

    if not source_data_dir.is_dir():
        raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
    for i in range(10):
        if not (p := source_data_dir / f"subset{i}").is_dir():
            raise ValueError(f"Expected {p} to contain Luna data")
    if not (p := source_data_dir / "annotations.csv").is_file():
        raise ValueError(f"Expected {p} to exist.")

    target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
    target_data_dir.mkdir(exist_ok=True, parents=True)
    target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
    target_label_dir.mkdir(exist_ok=True, parents=True)
    target_preprocessed_dir = task_data_dir / "preprocessed"
    target_preprocessed_dir.mkdir(exist_ok=True)

    logger.remove()
    logger.add(sys.stdout, level="INFO")
    logger.add(task_data_dir / "prepare.log", level="DEBUG")

    meta = {
        "name": "Luna",
        "task": "Task016_Luna",
        "target_class": None,
        "test_labels": False,
        "labels": {
            "0": "lesion",
        },
        "modalities": {
            "0": "CT",
        },
        "dim": 3,
    }
    save_json(meta, task_data_dir / "dataset.json")

    # prepare data and labels
    csv = source_data_dir / "annotations.csv"
    convert_data(source_data_dir, target_data_dir)
    df = pd.read_csv(csv, index_col='seriesuid')
    create_masks(source_data_dir, target_label_dir, df)

    # generate split
    logger.info("Generating Luna splits ...")
    saved_original_splits = load_json(task_data_dir / "splits.json")
    logger.info(f"Found {len(list(saved_original_splits.keys()))} ids in splits.json")
    original_fold_ids = defaultdict(list)
    for cid, fid in saved_original_splits.items():
        original_fold_ids[fid].append(cid)

    splits = []
    for test_fold in range(10):
        all_folds = list(range(10))
        all_folds.pop(test_fold)

        train_ids = []
        for af in all_folds:
            train_ids.extend(original_fold_ids[af])

        splits.append({
            "train": train_ids,
            "val": original_fold_ids[test_fold],
        })
    save_pickle(splits, target_preprocessed_dir / "splits_final.pkl")
    save_json(splits, target_preprocessed_dir / "splits_final.json")


if __name__ == '__main__':
    main()
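The fold definitions written above can be sanity-checked before training. A small sketch, assuming `prepare.py` has already been run and `det_data` points at the nnDetection data directory:

```python
import os
from pathlib import Path

from nndet.io.load import load_pickle

# splits_final.pkl holds one {"train": [...], "val": [...]} dict per official Luna subset.
preprocessed_dir = Path(os.getenv("det_data")) / "Task016_Luna" / "preprocessed"
splits = load_pickle(preprocessed_dir / "splits_final.pkl")

print(len(splits))                                     # expected: 10 folds
print(sorted(splits[0].keys()))                        # ['train', 'val']
print(len(splits[0]["train"]), len(splits[0]["val"]))
```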
import argparse
import os
import sys
from collections import defaultdict
from pathlib import Path

import pandas as pd
from loguru import logger
from tqdm import tqdm

from nndet.io.itk import load_sitk
from nndet.io.load import load_pickle
from nndet.core.boxes.ops_np import box_center_np


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('model', type=str, help="Name of model")
    args = parser.parse_args()
    model = args.model

    task_dir = Path(os.getenv("det_models")) / "Task016_Luna"
    model_dir = task_dir / model
    assert model_dir.is_dir()
    raw_splitted_images = Path(os.getenv("det_data")) / "Task016_Luna" / "raw_splitted" / "imagesTr"
    prediction_dir = model_dir / "consolidated" / "val_predictions"
    assert prediction_dir.is_dir()

    logger.remove()
    logger.add(sys.stdout, level="INFO")
    log_file = model_dir / "prepare_eval_cpm.log"
    logger.add(log_file, level="DEBUG")

    prediction_cache = defaultdict(list)
    prediction_paths = sorted([p for p in prediction_dir.iterdir()
                               if p.is_file() and p.name.endswith("_boxes.pkl")])
    logger.info(f"Found {len(prediction_paths)} predictions for evaluation")

    for prediction_path in tqdm(prediction_paths):
        seriesuid = prediction_path.stem.rsplit("_", 1)[0].replace('_', ".")
        predictions = load_pickle(prediction_path)
        data_path = raw_splitted_images / f"{prediction_path.stem.rsplit('_', 1)[0]}_0000.nii.gz"
        image_itk = load_sitk(data_path)

        boxes = predictions["pred_boxes"]
        probs = predictions["pred_scores"]
        centers = box_center_np(boxes)
        assert predictions["restore"]

        for center, prob in zip(centers, probs):
            # box centers are in array (z, y, x) order; SimpleITK expects (x, y, z) indices
            position_image = (float(center[2]), float(center[1]), float(center[0]))
            position_world = image_itk.TransformContinuousIndexToPhysicalPoint(position_image)

            prediction_cache["seriesuid"].append(seriesuid)
            prediction_cache["coordX"].append(float(position_world[0]))
            prediction_cache["coordY"].append(float(position_world[1]))
            prediction_cache["coordZ"].append(float(position_world[2]))
            prediction_cache["probability"].append(float(prob))

    df = pd.DataFrame(prediction_cache)
    df.to_csv(model_dir / f"{model}.csv")