Commit 7246044d authored by mibaumgartner

Merge remote-tracking branch 'origin/master' into main

parents fcec502f 6f4c3333
import sys
import os
from itertools import repeat
from multiprocessing.pool import Pool
import pandas as pd
import numpy as np
import numpy.testing as npt
import SimpleITK as sitk
from pathlib import Path
from loguru import logger
from tqdm import tqdm
from nndet.io.load import save_json, load_json
from nndet.io.paths import subfiles
from nndet.utils.check import env_guard
from nndet.utils.info import maybe_verbose_iterable
def prepare_case(case_dir: Path, target_dir: Path, df: pd.DataFrame):
target_data_dir = target_dir / "imagesTr"
target_label_dir = target_dir / "labelsTr"
    case_id = case_dir.name
logger.info(f"Processing case {case_id}")
df = df[df.PatientID == case_id]
# process data
img = sitk.ReadImage(str(case_dir / f"{case_id}_ct_scan.nrrd"))
sitk.WriteImage(img, str(target_data_dir / f"{case_id}.nii.gz"))
img_arr = sitk.GetArrayFromImage(img)
# process mask
final_rois = np.zeros_like(img_arr, dtype=np.uint8)
mal_labels = {}
roi_ids = set([ii.split('.')[0].split('_')[-1]
for ii in os.listdir(case_dir) if '.nii.gz' in ii])
rix = 1
for rid in roi_ids:
roi_id_paths = [ii for ii in os.listdir(case_dir) if '{}.nii'.format(rid) in ii]
nodule_ids = [ii.split('_')[2].lstrip("0") for ii in roi_id_paths]
rater_labels = [df[df.NoduleID == int(ii)].Malignancy.values[0] for ii in nodule_ids]
rater_labels.extend([0] * (4-len(rater_labels)))
mal_label = np.mean([ii for ii in rater_labels if ii > -1])
roi_rater_list = []
for rp in roi_id_paths:
roi = sitk.ReadImage(str(case_dir / rp))
roi_arr = sitk.GetArrayFromImage(roi).astype(np.uint8)
assert roi_arr.shape == img_arr.shape, [
roi_arr.shape, img_arr.shape, case_id, roi.GetSpacing()]
for ix in range(len(img_arr.shape)):
npt.assert_almost_equal(roi.GetSpacing()[ix], img.GetSpacing()[ix])
roi_rater_list.append(roi_arr)
roi_rater_list.extend([np.zeros_like(roi_rater_list[-1])]*(4-len(roi_id_paths)))
roi_raters = np.array(roi_rater_list)
roi_raters = np.mean(roi_raters, axis=0)
roi_raters[roi_raters < 0.5] = 0
if np.sum(roi_raters) > 0:
mal_labels[rix] = mal_label
final_rois[roi_raters >= 0.5] = rix
rix += 1
else:
# indicate rois suppressed by majority voting of raters
logger.warning(f'suppressed roi! {roi_id_paths}')
mask_itk = sitk.GetImageFromArray(final_rois)
sitk.WriteImage(mask_itk, str(target_label_dir / f"{case_id}.nii.gz"))
    instance_classes = {key: int(item >= 3) for key, item in mal_labels.items()}
save_json({"instances": instance_classes, "scores": mal_labels},
target_label_dir / f"{case_id}")
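
# A minimal sketch of the rater-consensus rule used above (hypothetical helper,
# not part of the original pipeline): ratings from up to four raters are padded
# with 0, the mean is taken over the valid scores (> -1), and a mean malignancy
# >= 3 is later mapped to the positive instance class.
def _consensus_malignancy_example(rater_labels):
    padded = list(rater_labels) + [0] * (4 - len(rater_labels))
    valid = [r for r in padded if r > -1]
    return float(np.mean(valid))  # e.g. [4, 5] -> [4, 5, 0, 0] -> mean 2.25
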
def reformat_labels(target: Path):
for p in subfiles(target, identifier="*json", join=True):
label = load_json(Path(p))
mal_labels = label["scores"]
instance_classes = {key: int(item >= 3) for key, item in mal_labels.items()}
save_json({"instances": instance_classes, "scores": mal_labels}, Path(p))
def delete_without_label(target: Path):
for p in subfiles(target, identifier="*.npz", join=True):
_p = str(p).rsplit('.', 1)[0] + '.pkl'
if not os.path.isfile(_p):
os.remove(p)
def check_data_load(target: Path):
for p in tqdm(subfiles(target, identifier="*.npy", join=True)):
try:
data = np.load(p)
except Exception as e:
print(f"Failed to load: {p} with {e}")
@env_guard
def main():
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task012_LIDC"
source_data_dir = task_data_dir / "raw"
if not (p := source_data_dir / "data_nrrd").is_dir():
raise ValueError(f"Expted {p} to contain LIDC data")
if not (p := source_data_dir / 'characteristics.csv').is_file():
raise ValueError(f"Expted {p} to contain exist")
target_dir = task_data_dir / "raw_splitted"
target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
target_data_dir.mkdir(exist_ok=True, parents=True)
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
target_label_dir.mkdir(exist_ok=True, parents=True)
logger.remove()
logger.add(sys.stdout, level="INFO")
logger.add(task_data_dir / "prepare.log", level="DEBUG")
data_dir = source_data_dir / "data_nrrd"
case_dirs = [x for x in data_dir.iterdir() if x.is_dir()]
df = pd.read_csv(source_data_dir / 'characteristics.csv', sep=';')
for cd in maybe_verbose_iterable(case_dirs):
prepare_case(cd, target_dir, df)
# TODO download custom split file
if __name__ == '__main__':
main()
# Luna16
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://luna16.grand-challenge.org/Home/
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task016_Luna`.
1. Follow the instructions and usage policies to download the data and place all the subsets into `Task016_Luna / raw`.
2. Run `python prepare.py` in `projects / Task016_Luna / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
Notes:
- Since Luna is evaluated via 10-fold cross-validation, all 10 folds need to be trained.
- All runs should use the `--sweep` option, and consolidation should be performed via `--no_model -c copy`, since we are not planning to predict a separate test set. A sketch of the resulting split structure is shown below.
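
For reference, a minimal sketch (with illustrative case ids; `prepare.py` derives the real mapping from the subset directories and stores it in `splits.json`) of how the predefined Luna folds translate into the train/val entries written to `splits_final.pkl`:

```python
# Sketch only: every predefined subset serves once as validation fold.
fold_to_cases = {0: ["case_a", "case_b"], 1: ["case_c"], 2: ["case_d"]}

splits = []
for val_fold in fold_to_cases:
    train = [cid for fold, cases in fold_to_cases.items()
             if fold != val_fold for cid in cases]
    splits.append({"train": train, "val": fold_to_cases[val_fold]})
```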
## Evaluation
1. Run `python prepare_eval_cpm.py [model_name]` to convert the predictions to the Luna format.
Note: The script needs access to the raw_splitted images.
2. Download and run the Luna evaluation script.
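
The conversion script below maps voxel-space box centers back to world coordinates before writing the csv. A minimal SimpleITK sketch of that transform (illustrative file name; note that the array axes are ordered z, y, x, so the index is reversed before the transform):

```python
import SimpleITK as sitk

image = sitk.ReadImage("case_0000.nii.gz")  # illustrative file name
center_zyx = (30.0, 120.0, 115.5)           # box center in voxel space (z, y, x)
index_xyz = (center_zyx[2], center_zyx[1], center_zyx[0])
world_xyz = image.TransformContinuousIndexToPhysicalPoint(index_xyz)
```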
import argparse
import os
import sys
import traceback
from collections import defaultdict
from itertools import repeat
from multiprocessing.pool import Pool
import pandas as pd
import SimpleITK as sitk
from pathlib import Path
from nndet.io.prepare import create_test_split
from loguru import logger
from nndet.io.itk import create_circle_mask_itk
from nndet.io.load import save_pickle, save_json, save_yaml, load_json
from nndet.utils.check import env_guard
def create_masks(source: Path, target: Path, df: pd.DataFrame, num_processes: int):
files = []
split = {}
for i in range(10):
subset_dir = source / f"subset{i}"
if not subset_dir.is_dir():
logger.error(f"{subset_dir} is not s valid subset directory!")
continue
tmp = list((subset_dir.glob('*.mhd')))
files.extend(tmp)
for t in tmp:
split[t.stem.replace('.', '_')] = i
save_json(split, target.parent.parent / "splits.json")
centers = []
rads = []
for f in files:
c = []
r = []
try:
            series_df = df.loc[[f.name.rsplit('.', 1)[0]]]  # list indexing keeps a DataFrame
except KeyError:
pass
else:
for _, row in series_df.iterrows():
c.append((float(row['coordX']), float(row['coordY']), float(row['coordZ'])))
r.append(float(row['diameter_mm']) / 2)
centers.append(c)
rads.append(r)
assert len(files) == len(centers) == len(rads)
with Pool(processes=num_processes) as p:
p.starmap(_create_mask, zip(files, repeat(target), centers, rads))
# for t in zip(files, repeat(target), centers, rads):
# _create_mask(*t)
def _create_mask(source, target, centers, rads):
try:
logger.info(f"Processing {source.stem}")
data = sitk.ReadImage(str(source))
mask = create_circle_mask_itk(data, centers, rads, ndim=3)
sitk.WriteImage(mask, str(target / f"{source.stem.replace('.', '_')}.nii.gz"))
save_json({"instances": {str(k + 1): 0 for k in range(len(centers))}},
target / f"{source.stem.replace('.', '_')}.json")
except Exception as e:
logger.error(f"Case {source.stem} failed with {e} and {traceback.format_exc()}")
def create_splits(source, target):
files = []
for p in source.glob('subset*'):
path = Path(p)
if not p.is_dir():
continue
_files = [str(i).rsplit('.', 1)[0] for i in path.iterdir() if i.suffix == ".mhd"]
files.append(_files)
splits = []
for i in range(len(files)):
        test = files[i]
        val_idx = (i + 1) % len(files)
        val = files[val_idx]
        # select the remaining folds by index; two pop() calls would shift positions
        train_ids = [j for j in range(len(files)) if j not in (i, val_idx)]
        assert len(train_ids) == len(files) - 2
train = [tr for tri in train_ids for tr in files[tri]]
splits.append({"train": train, "val": val, "test": test})
save_pickle(splits, target)
def convert_data(source: Path, target: Path, num_processes: int):
for subset_dir in source.glob('subset*'):
subset_dir = Path(subset_dir)
if not subset_dir.is_dir():
continue
with Pool(processes=num_processes) as p:
p.starmap(_convert_data, zip(subset_dir.glob('*.mhd'), repeat(target)))
def _convert_data(f, target):
logger.info(f"Converting {f}")
try:
data = sitk.ReadImage(str(f))
sitk.WriteImage(data, str(target / f"{f.stem.replace('.', '_')}_0000.nii.gz"))
except Exception as e:
logger.error(f"Case {f} failed with {e} and {traceback.format_exc()}")
@env_guard
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--num_processes', type=int, default=4, required=False,
help="Number of processes to use for preparation.")
args = parser.parse_args()
num_processes = args.num_processes
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task016_Luna"
source_data_dir = task_data_dir / "raw"
if not source_data_dir.is_dir():
raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
for i in range(10):
if not (p := source_data_dir / f"subset{i}"):
raise ValueError(f"Expected {p} to contain Luna data")
if not (p := source_data_dir / "annotations.csv").is_file():
raise ValueError(f"Exptected {p} to exist.")
target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
target_data_dir.mkdir(exist_ok=True, parents=True)
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
target_label_dir.mkdir(exist_ok=True, parents=True)
target_preprocessed_dir = task_data_dir / "preprocessed"
target_preprocessed_dir.mkdir(exist_ok=True)
logger.remove()
logger.add(sys.stdout, level="INFO")
logger.add(task_data_dir / "prepare.log", level="DEBUG")
meta = {
"name": "Luna",
"task": "Task016_Luna",
"target_class": None,
"test_labels": False,
"labels": {
"0": "lesion",
},
"modalities": {
"0": "CT",
},
"dim": 3,
}
save_json(meta, task_data_dir / "dataset.json")
# prepare data and labels
csv = source_data_dir / "annotations.csv"
convert_data(source_data_dir, target_data_dir, num_processes=num_processes)
df = pd.read_csv(csv, index_col='seriesuid')
create_masks(source_data_dir, target_label_dir, df, num_processes=num_processes)
# generate split
logger.info("Generating luna splits... ")
saved_original_splits = load_json(task_data_dir / "splits.json")
logger.info(f"Found {len(list(saved_original_splits.keys()))} ids in splits.json")
original_fold_ids = defaultdict(list)
for cid, fid in saved_original_splits.items():
original_fold_ids[fid].append(cid)
splits = []
for test_fold in range(10):
all_folds = list(range(10))
all_folds.pop(test_fold)
train_ids = []
for af in all_folds:
train_ids.extend(original_fold_ids[af])
splits.append({
"train": train_ids,
"val": original_fold_ids[test_fold],
})
save_pickle(splits, target_preprocessed_dir / "splits_final.pkl")
save_json(splits, target_preprocessed_dir / "splits_final.json")
if __name__ == '__main__':
main()
import argparse
import os
import sys
from pathlib import Path
from collections import defaultdict
import pandas as pd
from loguru import logger
from tqdm import tqdm
from nndet.io.itk import load_sitk
from nndet.io.load import load_pickle
from nndet.core.boxes.ops_np import box_center_np
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('model', type=str, help="Name of model")
args = parser.parse_args()
model = args.model
task_dir = Path(os.getenv("det_models")) / "Task016_Luna"
model_dir = task_dir / model
assert model_dir.is_dir()
raw_splitted_images = Path(os.getenv("det_data")) / "Task016_Luna" / "raw_splitted" / "imagesTr"
prediction_dir = model_dir / "consolidated" / "val_predictions"
assert prediction_dir.is_dir()
logger.remove()
logger.add(sys.stdout, level="INFO")
log_file = model_dir / "prepare_eval_cpm.log"
prediction_cache = defaultdict(list)
prediction_paths = sorted([p for p in prediction_dir.iterdir() if p.is_file() and p.name.endswith("_boxes.pkl")])
logger.info(f"Found {len(prediction_paths)} predictions for evaluation")
for prediction_path in tqdm(prediction_paths):
        seriesuid = prediction_path.stem.rsplit("_", 1)[0].replace('_', ".")
predictions = load_pickle(prediction_path)
data_path = raw_splitted_images / f"{prediction_path.stem.rsplit('_', 1)[0]}_0000.nii.gz"
image_itk = load_sitk(data_path)
boxes = predictions["pred_boxes"]
probs = predictions["pred_scores"]
centers = box_center_np(boxes)
assert predictions["restore"]
for center, prob in zip(centers, probs):
position_image = (float(center[2]), float(center[1]), float(center[0]))
position_world = image_itk.TransformContinuousIndexToPhysicalPoint(position_image)
prediction_cache["seriesuid"].append(seriusuid)
prediction_cache["coordX"].append(float(position_world[0]))
prediction_cache["coordY"].append(float(position_world[1]))
prediction_cache["coordZ"].append(float(position_world[2]))
prediction_cache["probability"].append(float(prob))
df = pd.DataFrame(prediction_cache)
df.to_csv(model_dir / f"{model}.csv")
# CADA
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://cada.grand-challenge.org/Introduction/
- Subtask: Task 1 aneurysm detection
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task017_CADA`.
1. Follow the instructions and usage policies to download the data and place the data and labels at the following locations: data -> `Task017_CADA / raw / train_dataset` and labels -> `Task017_CADA / raw / train_mask_images`
2. Run `python prepare.py` in `projects / Task017_CADA / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
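
Besides copying images and masks, the preparation script below derives the per-case instance file nnDetection expects. A minimal sketch of that step (illustrative file names; it assumes consecutive instance ids in the mask and maps every instance to class 0, i.e. aneurysm):

```python
from pathlib import Path

import SimpleITK as sitk
from nndet.io import save_json

# illustrative file names; the real script derives them from the case id
mask = sitk.GetArrayFromImage(sitk.ReadImage("A001_labeledMasks.nii.gz"))
instances = {str(i): 0 for i in range(1, int(mask.max()) + 1)}  # every aneurysm -> class 0
save_json({"instances": instances}, Path("A001.json"))
```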
import os
import shutil
from pathlib import Path
import SimpleITK as sitk
from nndet.io import save_json
from nndet.utils.check import env_guard
from nndet.utils.info import maybe_verbose_iterable
def run_prep(source_data: Path, source_label: Path,
target_data_dir, target_label_dir: Path):
case_id = f"{(source_data.stem).rsplit('_', 1)[0]}"
shutil.copy(source_data, target_data_dir / f"{case_id}_0000.nii.gz")
shutil.copy(source_label, target_label_dir / f"{case_id}.nii.gz") # rename label file to match data
label_itk = sitk.ReadImage(str(source_label))
label_np = sitk.GetArrayFromImage(label_itk)
instances = {int(_id + 1): 0 for _id in range(label_np.max())}
save_json({"instances": instances}, target_label_dir / f"{case_id}")
@env_guard
def main():
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task017_CADA"
# setup raw paths
source_data_dir = task_data_dir / "raw" / "train_dataset"
if not source_data_dir.is_dir():
raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
source_label_dir = task_data_dir / "raw" / "train_mask_images"
if not source_label_dir.is_dir():
raise RuntimeError(f"{source_label_dir} should contain the raw labels but does not exist.")
# setup raw splitted dirs
target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
target_data_dir.mkdir(exist_ok=True, parents=True)
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
target_label_dir.mkdir(exist_ok=True, parents=True)
# prepare dataset info
meta = {
"name": "CADA",
"task": "Task017_CADA",
"target_class": None,
"test_labels": False,
"labels": {"0": "aneurysm"},
"modalities": {"0": "CT"},
"dim": 3,
}
save_json(meta, task_data_dir / "dataset.json")
# prepare data & label
case_ids = [(p.stem).rsplit('_', 1)[0] for p in source_data_dir.glob("*.nii.gz")]
print(f"Found {len(case_ids)} case ids")
for cid in maybe_verbose_iterable(case_ids):
run_prep(
source_data=source_data_dir / f"{cid}_orig.nii.gz",
source_label=source_label_dir / f"{cid}_labeledMasks.nii.gz",
target_data_dir=target_data_dir,
target_label_dir=target_label_dir,
)
if __name__ == "__main__":
main()
# ADAM
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: http://adam.isi.uu.nl/
- Subtask: Task 1
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task019FG_ADAM`. We added FG to the ID to indicate that unruptured and ruptured aneurysms are treated as a single class, i.e. we run foreground vs background detection without distinguishing the classes.
1. Follow the instructions and usage policies to download the data and place the data into `Task019FG_ADAM / raw / ADAM_release_subjs`
2. Run `python prepare.py` in `projects / Task019_ADAM / scripts` of the nnDetection repository.
3. Run `python split.py` in `projects / Task019_ADAM / scripts` of the nnDetection repository.
4. [Info]: The provided instructions will automatically create a patient-stratified random split. We used a random split for our challenge submission. If the provided split file in the `preprocessed` folder is renamed, nnDetection will automatically fall back to creating a random split.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
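
In the script below, `instances_from_segmentation` is called with `fg_vs_bg=True`, which collapses all annotated instances onto a single foreground class. A minimal sketch of what that remap means for the instance-to-class mapping:

```python
# Sketch only: class-agnostic (fg vs bg) remapping of an instance mapping.
instances = {"1": 0, "2": 1, "3": 1}      # instance id -> original class
fg_instances = {k: 0 for k in instances}  # every instance becomes class 0
```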
import os
import shutil
from pathlib import Path
from nndet.io import save_json
from nndet.io.prepare import instances_from_segmentation
from nndet.utils.check import env_guard
from nndet.utils.info import maybe_verbose_iterable
def run_prep_fg_v_bg(
case_id: str,
source_data: Path,
target_data_dir,
target_label_dir: Path,
struct="pre/struct_aligned.nii.gz", # bias field corrected and aligned
tof="pre/TOF.nii.gz", # tof image
):
struct_path = source_data / case_id / struct
tof_path = source_data / case_id / tof
mask_path = source_data / case_id / "aneurysms.nii.gz"
shutil.copy(struct_path, target_data_dir / f"{case_id}_0000.nii.gz")
shutil.copy(tof_path, target_data_dir / f"{case_id}_0001.nii.gz")
instances_from_segmentation(mask_path,
target_label_dir,
fg_vs_bg=True,
file_name=f"{case_id}",
)
@env_guard
def main():
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task019FG_ADAM"
# setup raw paths
source_data_dir = task_data_dir / "raw" / "ADAM_release_subjs"
if not source_data_dir.is_dir():
raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
# setup raw splitted dirs
target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
target_data_dir.mkdir(exist_ok=True, parents=True)
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
target_label_dir.mkdir(exist_ok=True, parents=True)
# prepare dataset info
meta = {
"name": "ADAM",
"task": "Task019FG_ADAM",
"target_class": None,
"test_labels": False,
"labels": {"0": "Aneurysm"}, # since we are running FG vs BG this is not completely correct
"modalities": {"0": "Structured", "1": "TOF"},
"dim": 3,
}
save_json(meta, task_data_dir / "dataset.json")
# prepare data
case_ids = [p.stem for p in source_data_dir.iterdir() if p.is_dir()]
print(f"Found {len(case_ids)} case ids")
for cid in maybe_verbose_iterable(case_ids):
run_prep_fg_v_bg(
case_id=cid,
source_data=source_data_dir,
target_data_dir=target_data_dir,
target_label_dir=target_label_dir,
)
if __name__ == "__main__":
main()
import os
from collections import OrderedDict
from pathlib import Path
import numpy as np
from sklearn.model_selection import GroupKFold
from nndet.utils.check import env_guard
from nndet.io import get_case_ids_from_dir, save_pickle
@env_guard
def main():
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task019FG_ADAM"
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
splits_file_dir = task_data_dir / "preprocessed"
splits_file_dir.mkdir(parents=True, exist_ok=True)
splits_file = splits_file_dir / "splits_final.pkl"
case_ids = sorted(get_case_ids_from_dir(target_label_dir, remove_modality=False))
case_ids_pat = [c if c.isdigit() else c[:-1] for c in case_ids]
case_ids_pat_unique = list(set(case_ids_pat))
print(f"Found {len(case_ids_pat_unique)} unique patient ids.")
splits = []
kfold = GroupKFold(n_splits=5)
for i, (train_idx, test_idx) in enumerate(kfold.split(case_ids, groups=case_ids_pat)):
train_keys = np.array(case_ids)[train_idx]
test_keys = np.array(case_ids)[test_idx]
splits.append(OrderedDict())
splits[-1]['train'] = train_keys
splits[-1]['val'] = test_keys
print(f"Generated split: {splits[-1]}")
save_pickle(splits, splits_file)
if __name__ == '__main__':
main()
# RibFrac
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://ribfrac.grand-challenge.org/
- Subtask: Task 1
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task020FG_RibFrac`. We added FG to the ID to indicate that we don't distinguish the different classes. (Even if you prepare the data set with classes, the data needs to be placed inside that directory.)
1. Follow the instructions and usage policies to download the data and copy the data/labels/csv files to the following locations:
data -> `Task020FG_RibFrac / raw / imagesTr`; labels -> `Task020FG_RibFrac / raw / labelsTr`; csv files -> `Task020FG_RibFrac / raw`
2. Run `python prepare.py` in `projects / Task020FG_RibFrac / scripts` of the nnDetection repository.
Note: If no manual split is created, nnDetection will create a random 5-fold split, which is what we used for our results.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
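
The `create` function in the script below maps the RibFrac csv label codes onto nnDetection classes: code 0 is background and is skipped, codes 1-4 become classes 0-3, code -1 (undefined) becomes class 4, and with `fg_only=True` every fracture collapses to class 0. A small standalone sketch of that mapping:

```python
# Sketch of the label-code mapping used by the preparation script below.
def map_label_code(code: int, fg_only: bool = False) -> int:
    assert code != 0, "code 0 is background and is skipped upstream"
    if fg_only:
        return 0
    cls = 5 if code == -1 else code
    return cls - 1  # resulting classes range from 0 to 4
```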
import os
import shutil
from pathlib import Path
import pandas as pd
from nndet.io import save_json
from nndet.utils.check import env_guard
from nndet.utils.info import maybe_verbose_iterable
def create(
image_source: Path,
label_source: Path,
image_target_dir: Path,
label_target_dir: Path,
df: pd.DataFrame,
fg_only: bool = False,
):
image_target_dir.mkdir(parents=True, exist_ok=True)
label_target_dir.mkdir(parents=True, exist_ok=True)
case_id = image_source.stem.rsplit('-', 1)[0]
case_id_check = label_source.stem.rsplit('-', 1)[0]
assert case_id == case_id_check, f"case ids not matching, found image {case_id} and label {case_id_check}"
df_case = df.loc[df['public_id'] == case_id]
instances = {}
for row in df_case.itertuples():
_cls = int(row.label_code)
        if _cls == 0:  # background has label code 0 and label id 0
continue
if fg_only:
_cls = 1
elif _cls == -1:
_cls = 5
instances[str(row.label_id)] = _cls - 1 # class range from 0 - 4 // if fg only 0
assert 0 < _cls < 6, f"Something strange happened {_cls}"
save_json({"instances": instances}, label_target_dir / f"{case_id}.json")
shutil.copy2(image_source, image_target_dir / f"{case_id}_0000.nii.gz")
shutil.copy2(label_source, label_target_dir / f"{case_id}.nii.gz")
@env_guard
def main():
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task020_RibFrac"
source_data_dir = task_data_dir / "raw"
if not source_data_dir.is_dir():
raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
if not (p := source_data_dir / "imagesTr").is_dir():
raise ValueError(f"Expected data to be located at {p}")
if not (p := source_data_dir / "labelsTr").is_dir():
raise ValueError(f"Expected labels to be located at {p}")
if not (p := source_data_dir / "ribfrac-train-info-1.csv").is_file():
raise ValueError(f"Expected {p} to exist.")
if not (p := source_data_dir / "ribfrac-train-info-2.csv").is_file():
raise ValueError(f"Expected {p} to exist.")
if not (p := source_data_dir / "ribfrac-val-info.csv").is_file():
raise ValueError(f"Expected {p} to exist.")
target_data_dir = task_data_dir / "raw_splitted" / "imagesTr"
target_data_dir.mkdir(exist_ok=True, parents=True)
target_label_dir = task_data_dir / "raw_splitted" / "labelsTr"
target_label_dir.mkdir(exist_ok=True, parents=True)
    csv_files = [source_data_dir / "ribfrac-train-info-1.csv",
                 source_data_dir / "ribfrac-train-info-2.csv",
                 source_data_dir / "ribfrac-val-info.csv"]
    df = pd.concat([pd.read_csv(f) for f in csv_files])
image_paths = list((source_data_dir / "imagesTr").glob("*.nii.gz"))
image_paths.sort()
label_paths = list((source_data_dir / "labelsTr").glob("*.nii.gz"))
label_paths.sort()
print(f"Found {len(image_paths)} data files and {len(label_paths)} label files.")
assert len(image_paths) == len(label_paths)
meta = {
"name": "RibFracFG",
"task": "Task020FG_RibFrac",
"target_class": None,
"test_labels": False,
"labels": {"0": "fracture"}, # since we are running FG vs BG this is not completely correct
"modalities": {"0": "CT"},
"dim": 3,
}
save_json(meta, task_data_dir / "dataset.json")
for ip, lp in maybe_verbose_iterable(list(zip(image_paths, label_paths))):
create(image_source=ip,
label_source=lp,
image_target_dir=target_data_dir,
label_target_dir=target_label_dir,
df=df,
fg_only=True,
)
if __name__ == '__main__':
main()
# ProstateX
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Data: https://wiki.cancerimagingarchive.net/display/Public/SPIE-AAPM-NCI+PROSTATEx+Challenges
- Masks: https://github.com/rcuocolo/PROSTATEx_masks
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task021_ProstateX`.
1. Download the data and labels and place them in the following structure:
```text
{det_data}
Task021_ProstateX
raw
ktrains
ProstateX
ProstateX-TrainingLesionInformationv2
rcuocolo-PROSTATEx_masks-e344452
```
We used the masks from git hash e3444521e70cd5e8d405f4e9a6bc08312df8afe7 for our experiments.
For training, only the T2 masks were used as ground truth, together with the T2, ADC, and high b-value images (no Ktrans).
If you intend to use the Ktrans sequence, simply add it to the `dataset.json` file; the data is already prepared by the script.
2. Run `python prepare.py` in `projects / Task021_ProstateX / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
Note: Since ProstateX only contains a fairly small number of clinically significant lesions and we used a 30% test split, we observed a fairly high variance in the performance of our runs.
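
The preparation script below resamples the ADC, PD-W, and Ktrans images onto the T2 grid, since only early fusion is currently supported and all modalities therefore need a common grid. A minimal SimpleITK sketch of that step (illustrative file names; linear interpolation is the filter default):

```python
import SimpleITK as sitk

t2 = sitk.ReadImage("t2.nii.gz")    # reference grid
adc = sitk.ReadImage("adc.nii.gz")  # modality to resample

resampler = sitk.ResampleImageFilter()  # default interpolator: linear
resampler.SetReferenceImage(t2)
adc_on_t2 = resampler.Execute(adc)
```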
import os
import sys
import traceback
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path
import pandas as pd
import SimpleITK as sitk
from nndet.io.prepare import create_test_split
from loguru import logger
from nndet.utils.check import env_guard
from nndet.io import save_json, save_yaml
from nndet.io.itk import load_sitk, load_sitk_as_array, copy_meta_data_itk
from nndet.utils.info import maybe_verbose_iterable
def load_dicom_series_sitk(p):
reader = sitk.ImageSeriesReader()
dicom_names = reader.GetGDCMSeriesFileNames(str(p))
reader.SetFileNames(dicom_names)
return reader.Execute()
def prepare_case(case_id,
data_dirs,
ktrans_dirs,
t2_masks,
df_labels,
df_masks,
data_target,
label_target,
):
try:
logger.info(f"Preparing {case_id}")
tmp_dir = data_dirs / case_id
_dirs = [f for f in tmp_dir.iterdir() if f.is_dir()]
assert len(_dirs) == 1
data_dir = tmp_dir / _dirs[0]
df_mask_case = df_masks[df_masks['T2'].str.contains(case_id)]
assert len(df_mask_case) == 1
t2_mask_file = df_mask_case.iloc[0]["T2"]
assert f"{case_id}" in t2_mask_file
t2_series_id = int(t2_mask_file.rsplit(".", 2)[0].rsplit('_', 1)[1])
adc_mask_file = df_mask_case.iloc[0]["ADC"]
assert f"{case_id}" in adc_mask_file
if case_id == "ProstateX-0025":
# case 0025 has a 7a inside the table
adc_series_id = 7
assert adc_mask_file.endswith("7a.nii.gz")
elif case_id == "ProstateX-0113":
# even though the table shows 9 as the series
# ID we use 10 because 9 is not an ADC file?
adc_series_id = int(adc_mask_file.rsplit(".", 2)[0].rsplit('_', 1)[1])
assert adc_series_id == 9
adc_series_id = 10
else:
adc_series_id = int(adc_mask_file.rsplit(".", 2)[0].rsplit('_', 1)[1])
# T2
t2_dir = [f for f in data_dir.glob("*t2*") if f.name.startswith(f"{t2_series_id}.")]
assert len(t2_dir) == 1
t2_data_itk = load_dicom_series_sitk(t2_dir[0])
# ADC
adc_dir = [f for f in data_dir.glob("*ADC*") if f.name.startswith(f"{adc_series_id}.")]
assert len(adc_dir) == 1
adc_data_itk = load_dicom_series_sitk(adc_dir[0])
# PD-W
pdw_dir = sorted(data_dir.glob("* PD *"))[-1]
pdw_data_itk = load_dicom_series_sitk(pdw_dir)
# k-trans
ktrans_dir = ktrans_dirs / case_id
ktrans_data_itk = load_sitk(ktrans_dir / f"{case_id}-Ktrans.mhd")
# resample data to t2 (only early fusion is currently supported)
resampler = sitk.ResampleImageFilter() # default linear
resampler.SetReferenceImage(t2_data_itk)
adc_data_itk_res = resampler.Execute(adc_data_itk)
pdw_data_itk_res = resampler.Execute(pdw_data_itk)
ktrans_data_itk_res = resampler.Execute(ktrans_data_itk)
# prepare mask
mask_paths = list(t2_masks.glob(f"{case_id}*"))
fids = [int([l for l in mp.name.split("-") if "Finding" in l][0][7:]) for mp in mask_paths]
mask_itk = load_sitk(str(mask_paths[0]))
mask = sitk.GetArrayFromImage(mask_itk)
mask[mask > 0] = 1
for idx, mp in enumerate(mask_paths[1:], start=2):
_mask = load_sitk_as_array(str(mp))[0]
mask[_mask > 0] = idx
mask_final = sitk.GetImageFromArray(mask)
copy_meta_data_itk(t2_data_itk, mask_final)
df_case = df_labels.loc[df_labels['ProxID'] == case_id]
instances = {}
for row in df_case.itertuples():
if row.fid in fids:
instances[fids.index(int(row.fid)) + 1] = int(row.ClinSig)
else:
logger.info(f"Found removed fid {row.fid} in {case_id}")
# save
sitk.WriteImage(t2_data_itk, str(data_target / f"{case_id}_0000.nii.gz"))
sitk.WriteImage(adc_data_itk_res, str(data_target / f"{case_id}_0001.nii.gz"))
sitk.WriteImage(pdw_data_itk_res, str(data_target / f"{case_id}_0002.nii.gz"))
sitk.WriteImage(ktrans_data_itk_res, str(data_target / f"{case_id}_0003.nii.gz"))
sitk.WriteImage(mask_final, str(label_target / f"{case_id}.nii.gz"))
save_json({"instances": instances}, label_target / f"{case_id}.json")
except Exception as e:
logger.error(f"Case {case_id} failed with {e} and {traceback.format_exc()}")
@env_guard
def main():
"""
Does not use the KTrans Sequence of ProstateX
This script only uses the provided T2 masks
"""
det_data_dir = Path(os.getenv('det_data'))
task_data_dir = det_data_dir / "Task021_ProstateX"
# setup raw paths
source_data_dir = task_data_dir / "raw"
if not source_data_dir.is_dir():
raise RuntimeError(f"{source_data_dir} should contain the raw data but does not exist.")
source_data = source_data_dir / "PROSTATEx"
source_masks = source_data_dir / "rcuocolo-PROSTATEx_masks-e344452"
source_ktrans = source_data_dir / "ktrains"
csv_labels = source_data_dir / "ProstateX-TrainingLesionInformationv2" / "ProstateX-Findings-Train.csv"
csv_masks = source_data_dir / "rcuocolo-PROSTATEx_masks-e344452" / "Files" / "Image_list.csv"
data_target = task_data_dir / "raw_splitted" / "imagesTr"
data_target.mkdir(parents=True, exist_ok=True)
label_target = task_data_dir / "raw_splitted" / "labelsTr"
label_target.mkdir(parents=True, exist_ok=True)
logger.remove()
logger.add(sys.stdout, format="{level} {message}", level="INFO")
logger.add(data_target.parent.parent / "prepare.log", level="DEBUG")
base_masks = source_masks / "Files" / "Masks"
t2_masks = base_masks / "T2"
df_labels = pd.read_csv(csv_labels)
df_masks = pd.read_csv(csv_masks)
case_ids = [f.stem.split("-", 2)[:2] for f in t2_masks.glob("*nii.gz")]
case_ids = list(set([f"{c[0]}-{c[1]}" for c in case_ids]))
logger.info(f"Found {len(case_ids)} cases")
# save meta
logger.info("Saving dataset info")
dataset_info = {
"name": "ProstateX",
"task": "Task021_ProstateX",
"target_class": None,
"test_labels": False,
"labels": {
"0": "clinically_significant",
"1": "clinically_insignificant",
},
"modalities": {
"0": "T2",
"1": "ADC",
"2": "PD-W",
"3": "Ktrans"
},
"dim": 3,
"info": "Ground Truth: T2 Masks; \n"
"Modalities: T2, ADC, PD-W, Ktrans \n;"
"Classes: clinically significant = 1, insignificant = 0 \n"
"Keep: ProstateX-0025 '10-28-2011-MR prostaat kanker detectie WDSmc MCAPRODETW-19047'\n"
"Masks\n"
"https://github.com/rcuocolo/PROSTATEx_masks\n"
"Github hash: e3444521e70cd5e8d405f4e9a6bc08312df8afe7"
}
save_json(dataset_info, task_data_dir / "dataset.json")
# prepare labels and data
for cid in maybe_verbose_iterable(case_ids):
prepare_case(cid,
data_dirs=source_data,
ktrans_dirs=source_ktrans,
t2_masks=t2_masks,
df_labels=df_labels,
df_masks=df_masks,
data_target=data_target,
label_target=label_target,
)
# with Pool(processes=6) as p:
# p.starmap(prepare_case, zip(case_ids,
# repeat(source_data),
# repeat(source_ktrans),
# repeat(t2_masks),
# repeat(df_labels),
# repeat(df_masks),
# repeat(data_target),
# repeat(label_target),
# ))
# create test split
create_test_split(task_data_dir / "raw_splitted",
num_modalities=len(dataset_info["modalities"]),
test_size=0.3,
random_state=0,
shuffle=True,
)
if __name__ == '__main__':
main()
# LymphNodes
**Disclaimer**: We are not the host of the data.
Please make sure to read the requirements and usage policies of the data and **give credit to the authors of the dataset**!
Please read the information from the homepage carefully and follow the rules and instructions provided by the original authors when using the data.
- Homepage: https://wiki.cancerimagingarchive.net/display/Public/CT+Lymph+Nodes
- Masks: we used the masks provided by the same page
## Setup
0. Follow the installation instructions of nnDetection and create a data directory named `Task025_LymphNodes`.
1. Download the data and labels and place the data into `Task025_LymphNodes / raw / CT Lymph Nodes` and the labels into `Task025_LymphNodes / raw / MED_ABD_LYMPH_MASKS`.
2. Run `python prepare.py` in `projects / Task025_LymphNodes / scripts` of the nnDetection repository.
The data is now converted to the correct format and the instructions from the nnDetection README can be used to train the networks.
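
The script below locates each patient's DICOM folder and reads the series as a single volume. A minimal sketch of the SimpleITK series read it relies on (illustrative path):

```python
import SimpleITK as sitk

reader = sitk.ImageSeriesReader()
dicom_names = reader.GetGDCMSeriesFileNames("/path/to/dicom/folder")
reader.SetFileNames(dicom_names)
volume = reader.Execute()
```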
import os
import shutil
import sys
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path
from nndet.utils.check import env_guard
import numpy as np
from loguru import logger
import SimpleITK as sitk
from nndet.io import save_json
from nndet.io.prepare import create_test_split
from nndet.io.itk import load_sitk_as_array
from nndet.utils.info import maybe_verbose_iterable
def prepare_image(
case_id: str,
base_dir: Path,
mask_dir: Path,
raw_splitted_dir: Path,
):
logger.info(f"Processing {case_id}")
root_data_dir = base_dir / case_id
patient_data_dir = []
for root, dirs, files in os.walk(root_data_dir, topdown=False):
if any([f.endswith(".dcm") for f in files]):
patient_data_dir.append(Path(root))
assert len(patient_data_dir) == 1
patient_data_dir = patient_data_dir[0]
reader = sitk.ImageSeriesReader()
dicom_names = reader.GetGDCMSeriesFileNames(str(patient_data_dir))
reader.SetFileNames(dicom_names)
data_itk = reader.Execute()
patient_label_dir = mask_dir / case_id
label_path = [p for p in patient_label_dir.iterdir() if p.is_file() and p.name.endswith(".nii.gz")]
assert len(label_path) == 1
label_path = label_path[0]
mask = load_sitk_as_array(label_path)[0]
instances = np.unique(mask)
instances = instances[instances > 0]
meta = {"instances": {str(int(i)): 0 for i in instances}}
meta["original_path_data"] = str(patient_data_dir)
meta["original_path_label"] = str(label_path)
save_json(meta, raw_splitted_dir / "labelsTr" / f"{case_id}.json")
sitk.WriteImage(data_itk, str(raw_splitted_dir / "imagesTr" / f"{case_id}_0000.nii.gz"))
shutil.copy(label_path, raw_splitted_dir / "labelsTr" / f"{case_id}.nii.gz")
@env_guard
def main():
det_data_dir = Path(os.getenv("det_data"))
task_data_dir = det_data_dir / "Task025_LymphNodes"
source_data_base = task_data_dir / "raw"
if not source_data_base.is_dir():
raise RuntimeError(f"{source_data_base} should contain the raw data but does not exist.")
raw_splitted_dir = task_data_dir / "raw_splitted"
(raw_splitted_dir / "imagesTr").mkdir(parents=True, exist_ok=True)
(raw_splitted_dir / "labelsTr").mkdir(parents=True, exist_ok=True)
(raw_splitted_dir / "imagesTs").mkdir(parents=True, exist_ok=True)
(raw_splitted_dir / "labelsTs").mkdir(parents=True, exist_ok=True)
logger.remove()
logger.add(sys.stdout, format="{level} {message}", level="DEBUG")
logger.add(raw_splitted_dir.parent / "prepare.log", level="DEBUG")
meta = {
"name": "Lymph Node TCIA",
"task": "Task025_LymphNodes",
"target_class": None,
"test_labels": True,
"labels": {
"0": "LymphNode",
},
"modalities": {
"0": "CT",
},
"dim": 3,
}
save_json(meta, raw_splitted_dir.parent / "dataset.json")
base_dir = source_data_base / "CT Lymph Nodes"
mask_dir = source_data_base / "MED_ABD_LYMPH_MASKS"
case_ids = sorted([p.name for p in base_dir.iterdir() if p.is_dir()])
logger.info(f"Found {len(case_ids)} cases in {base_dir}")
for cid in maybe_verbose_iterable(case_ids):
prepare_image(
case_id=cid,
base_dir=base_dir,
mask_dir=mask_dir,
raw_splitted_dir=raw_splitted_dir,
)
# with Pool(processes=6) as p:
# p.starmap(
# prepare_image,
# zip(
# case_ids,
# repeat(base_dir),
# repeat(mask_dir),
# repeat(raw_splitted_dir)
# )
# )
create_test_split(raw_splitted_dir,
num_modalities=len(meta["modalities"]),
test_size=0.3,
random_state=0,
shuffle=True,
)
if __name__ == '__main__':
main()
"""
Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse
import shutil
import sys
import os
from pathlib import Path
from typing import Sequence
from loguru import logger
from nndet.utils.check import env_guard
from omegaconf import OmegaConf
from nndet.ptmodule import MODULE_REGISTRY
from nndet.inference.sweeper import BoxSweeper
from nndet.inference.loading import get_latest_model
from nndet.inference.ensembler.base import extract_results
from nndet.io import get_task, load_pickle, save_pickle
def consolidate_models(source_dirs: Sequence[Path], target_dir: Path, ckpt: str):
"""
Copy final models from folds into consolidated folder
Args:
source_dirs: directory of each fold to consolidate
target_dir: directory to save models to
ckpt: checkpoint identifier to select models for ensembling
"""
for fold, sd in enumerate(source_dirs):
model_paths = list(sd.glob('*.ckpt'))
found_models = [mp for mp in model_paths if ckpt in str(mp.stem)]
assert len(found_models) == 1, f"Found wrong number of models, {found_models}"
model_path = found_models[0]
assert f"fold{fold}" in str(model_path.parent.stem), f"Expected fold {fold} but found {model_path}"
shutil.copy2(model_path, target_dir / f"model_fold{fold}.ckpt")
def consolidate_predictions(
source_dirs: Sequence[Path],
target_dir: Path,
consolidate: str,
):
"""
Consolidate sweep states to find new postprocessing hyperparameters
Args:
source_dirs: directory of each fold
        target_dir: directory of consolidated models
consolidate: consolidation mode
"""
if consolidate == 'export':
logger.info("Consolidating sweep states for refinement.")
postfix = "sweep_predictions"
elif consolidate == 'copy':
logger.info("Consolidating val predictions for evaluation")
postfix = "val_predictions"
else:
raise ValueError(f"Consolidation {consolidate} is not supported")
pred_dir = target_dir / postfix
pred_dir.mkdir(parents=True, exist_ok=True)
for source_dir in source_dirs:
for p in [p for p in (source_dir / postfix).iterdir() if p.is_file()]:
shutil.copy(p, pred_dir)
@env_guard
def main():
parser = argparse.ArgumentParser()
parser.add_argument('task', type=str,
help="Task id e.g. Task12_LIDC OR 12 OR LIDC",
)
parser.add_argument('model', type=str,
help="model name, e.g. RetinaUNetV0",
)
parser.add_argument('-o', '--overwrites', type=str, nargs='+', required=False,
help="overwrites for config file. Only needed in case of box eval",
)
parser.add_argument('-c', '--consolidate', type=str, default="export", required=False,
help=("Determines how to consolidate predictions: 'export' or 'copy'. "
"'copy' will copy the predictions of each fold into the directory for evaluation. "
"'export' will use the updated parameters after consolidation to update the "
"predictions and export them. This is only supported if one of the "
"sweep settings is active! Default: export"),
)
parser.add_argument('--num_folds', type=int, default=5, required=False,
help="Number of folds. Default: 5",
)
parser.add_argument('--no_model', action="store_false",
help="Deactivate if consolidating nnUNet results",
)
parser.add_argument('--sweep_boxes', action="store_true",
help="Sweep for best parameters for bounding box based models",
)
parser.add_argument('--sweep_instances', action="store_true",
help="Sweep for best parameters for instance segmentation based models",
)
parser.add_argument('--ckpt', type=str, default="last", required=False,
help="Define identifier of checkpoint for consolidation. "
"Use this with care!")
args = parser.parse_args()
model = args.model
task = args.task
ov = args.overwrites
consolidate = args.consolidate
num_folds = args.num_folds
do_model_consolidation = args.no_model
sweep_boxes = args.sweep_boxes
sweep_instances = args.sweep_instances
ckpt = args.ckpt
if consolidate == "export" and not (sweep_boxes or sweep_instances):
raise ValueError("Export needs new parameter sweep! Actiate one of the sweep "
"arguments or change to copy mode")
task_dir = Path(os.getenv("det_models")) / get_task(task, name=True, models=True)
model_dir = task_dir / model
if not model_dir.is_dir():
raise ValueError(f"{model_dir} does not exist")
target_dir = model_dir / "consolidated"
logger.remove()
logger.add(sys.stdout, format="{level} {message}", level="INFO")
logger.add(Path(target_dir) / "consolidate.log", level="DEBUG")
logger.info(f"looking for models in {model_dir}")
training_dirs = [get_latest_model(model_dir, fold) for fold in range(num_folds)]
logger.info(f"Found training dirs: {training_dirs}")
# model consolidation
if do_model_consolidation:
logger.info("Consolidate models")
if ckpt != "last":
logger.warning(f"Found ckpt overwrite {ckpt}, this is not the default, "
"this can drastically influence the performance!")
consolidate_models(training_dirs, target_dir, ckpt)
# consolidate predictions
logger.info("Consolidate predictions")
consolidate_predictions(
source_dirs=training_dirs,
target_dir=target_dir,
consolidate=consolidate,
)
shutil.copy2(training_dirs[0] / "plan.pkl", target_dir)
shutil.copy2(training_dirs[0] / "config.yaml", target_dir)
# invoke new parameter sweeps
cfg = OmegaConf.load(str(target_dir / "config.yaml"))
ov = ov if ov is not None else []
ov.append("host.parent_data=${env:det_data}")
ov.append("host.parent_results=${env:det_models}")
if ov is not None:
cfg.merge_with_dotlist(ov)
preprocessed_output_dir = Path(cfg["host"]["preprocessed_output_dir"])
plan = load_pickle(target_dir / "plan.pkl")
gt_dir = preprocessed_output_dir / plan["data_identifier"] / "labelsTr"
if sweep_boxes:
logger.info("Sweeping box predictions")
module = MODULE_REGISTRY[cfg["module"]]
ensembler_cls = module.get_ensembler_cls(
key="boxes", dim=plan["network_dim"]) # TODO: make this configurable
sweeper = BoxSweeper(
classes=[item for _, item in cfg["data"]["labels"].items()],
pred_dir=target_dir / "sweep_predictions",
gt_dir=gt_dir,
target_metric=cfg["trainer_cfg"].get("eval_score_key",
"mAP_IoU_0.10_0.50_0.05_MaxDet_100"),
ensembler_cls=ensembler_cls,
save_dir=target_dir / "sweep",
)
inference_plan = sweeper.run_postprocessing_sweep()
elif sweep_instances:
raise NotImplementedError
plan = load_pickle(target_dir / "plan.pkl")
if consolidate != 'copy':
plan["inference_plan"] = inference_plan
save_pickle(plan, target_dir / "plan_inference.pkl")
for restore in [True, False]:
export_dir = target_dir / "val_predictions" if restore else \
target_dir / "val_predictions_preprocessed"
extract_results(
source_dir=target_dir / "sweep_predictions",
target_dir=export_dir,
ensembler_cls=ensembler_cls,
restore=restore,
**inference_plan,
)
else:
logger.warning("Plan used from fold 0, not updated with consolidation")
save_pickle(plan, target_dir / "plan_inference.pkl")
if __name__ == '__main__':
main()
"""
Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse
import shutil
import sys
from pathlib import Path
from hydra.experimental import initialize_config_module
from loguru import logger
from nndet.io import get_task, load_json, save_json
from nndet.utils.config import compose, load_dataset_info
from nndet.utils.check import env_guard
def convert_raw(task, overwrite, ov):
task_name_full = get_task(task, name=True)
task_num, task_name = task_name_full[4:].split('_', 1)
new_task_name_full = f"Task{task_num}FG_{task_name}"
cfg = compose(task, "config.yaml", overrides=ov if ov is not None else [])
print(cfg.pretty())
source_splitted_dir = Path(cfg["host"]["splitted_4d_output_dir"])
target_splitted_dir = Path(str(source_splitted_dir).replace(task_name_full, new_task_name_full))
if target_splitted_dir.is_dir() and overwrite:
shutil.rmtree(target_splitted_dir)
target_splitted_dir.mkdir(parents=True)
logger.remove()
logger.add(sys.stdout, level="INFO")
logger.add(target_splitted_dir.parent / "convert_cls2fg.log", level="DEBUG")
# update dataset_info
source_data_info = Path(cfg["host"]["data_dir"])
data_info = load_dataset_info(source_data_info)
data_info.pop("labels")
data_info["labels"] = {"0": "fg"}
data_info["task"] = new_task_name_full
save_json(data_info, target_splitted_dir.parent / "dataset.json", indent=4)
for postfix in ["Tr", "Ts"]:
source_image_dir = source_splitted_dir / f"images{postfix}"
source_label_dir = source_splitted_dir / f"labels{postfix}"
if not source_image_dir.is_dir():
logger.info(f"{source_image_dir} is not a dir. Skipping it.")
continue
# copy images and labels
shutil.copytree(source_image_dir, target_splitted_dir / f"images{postfix}")
shutil.copytree(source_label_dir, target_splitted_dir / f"labels{postfix}")
# remap properties file to foreground class
target_label_dir = target_splitted_dir / f"labels{postfix}"
for f in [l for l in target_label_dir.glob("*.json")]:
props = load_json(f)
props["instances"] = {key: 0 for key in props["instances"].keys()}
save_json(props, f)
@env_guard
def main():
"""
Convert raw splitted data with class sensitive annotations into
a new dataset which only distinguishes fg and bg
"""
parser = argparse.ArgumentParser()
parser.add_argument('tasks', type=str, nargs='+',
help="Single or multiple task identifiers to process consecutively",
)
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('-o', '--overwrites', type=str, nargs='+',
help="overwrites for config file",
required=False)
args = parser.parse_args()
tasks = args.tasks
ov = args.overwrites
overwrite = args.overwrite
initialize_config_module(config_module="nndet.conf")
for task in tasks:
convert_raw(task, overwrite, ov)
if __name__ == '__main__':
main()
"""
Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse
import sys
from datetime import datetime
from itertools import repeat
from multiprocessing import Pool
from pathlib import Path
from typing import Sequence
import numpy as np
import SimpleITK as sitk
from hydra.experimental import initialize_config_module
from loguru import logger
from scipy import ndimage
from scipy.ndimage import label
from tqdm import tqdm
from nndet.core.boxes import box_size_np
from nndet.io import get_case_ids_from_dir, load_json, save_json
from nndet.io.transforms.instances import get_bbox_np
from nndet.io.itk import copy_meta_data_itk, load_sitk, load_sitk_as_array
from nndet.utils.config import compose
def prepare_detection_label(case_id: str,
label_dir: Path,
things_classes: Sequence[int],
stuff_classes: Sequence[int],
min_size: float = 0,
min_vol: float = 0,
):
if (label_dir / f"{case_id}.json").is_file():
logger.info(f"Found existing case {case_id} -> skipping")
return
logger.info(f"Processing {case_id}")
seg_itk = load_sitk(label_dir / f"{case_id}.nii.gz")
spacing = np.asarray(seg_itk.GetSpacing())[::-1]
seg = sitk.GetArrayFromImage(seg_itk)
# prepare stuff information
stuff_seg = np.zeros_like(seg)
if stuff_classes:
for new_class, old_class in enumerate(stuff_classes, start=1):
stuff_seg[seg == old_class] = new_class
stuff_seg_itk = copy_meta_data_itk(seg_itk, sitk.GetImageFromArray(stuff_seg))
sitk.WriteImage(stuff_seg_itk, str(label_dir / f"{case_id}_stuff.nii.gz"))
# prepare things information
structure = np.ones([3] * seg.ndim)
things_seg = np.copy(seg)
things_seg[stuff_seg > 0] = 0 # remove all stuff classes from segmentation
instances_not_filtered, _ = label(things_seg, structure=structure)
final_mapping = {}
if instances_not_filtered.max() > 0:
boxes = get_bbox_np(instances_not_filtered[None])["boxes"]
box_sizes = box_size_np(boxes)
instance_ids = np.unique(instances_not_filtered)
instance_ids = instance_ids[instance_ids > 0]
assert len(instance_ids) == len(boxes)
        isotropic_axes = list(range(seg.ndim))
        isotropic_axes.pop(np.argmax(spacing))
instances = np.zeros_like(instances_not_filtered)
start_id = 1
for iid, bsize in zip(instance_ids, box_sizes):
bsize_world = bsize * spacing
instance_mask = (instances_not_filtered == iid)
instance_vol = instance_mask.sum()
            if all(bsize_world[isotropic_axes] > min_size) and (instance_vol > min_vol):
instances[instance_mask] = start_id
single_idx = np.argwhere(instance_mask)[0]
semantic_class = int(seg[tuple(single_idx)])
final_mapping[start_id] = things_classes.index(semantic_class)
start_id += 1
else:
instances = np.zeros_like(instances_not_filtered)
final_instances_itk = copy_meta_data_itk(seg_itk, sitk.GetImageFromArray(instances))
sitk.WriteImage(final_instances_itk, str(label_dir / f"{case_id}.nii.gz"))
save_json({"instances": final_mapping}, label_dir / f"{case_id}.json")
sitk.WriteImage(seg_itk, str(label_dir / f"{case_id}_orig.nii.gz"))
if __name__ == '__main__':
"""
This script converts a semantic segmentation dataset into an instance
segmentation dataset by using connected components on the labels.
    To account for stray, disconnected pixels inside the annotations, only
    connected components above a specified minimal size are converted into objects.
The data needs to be in the same format as in nnunet: images
stay the same, labels will be semantic segmentations.
============================================================================
================================IMPORTANT==================================+
============================================================================
Needs additional information from dataset.json/.yaml:
`seg2det_stuff`: these are classes which are interpreted semantically
`seg2det_things`: these are classes which are interpreted as instances
Both entries should be lists with the indices of the respective
classes where the position will determine its new class
e.g.
`seg2det_stuff`: [2,] -> remap class 2 from semantic segmentation
to new stuff class 1 (stuff classes start at one)
`seg2det_things`: [1, 3] -> remap class 1 and 3 from semantic
segmentation to new things classes 0 and 1, respectively
`min_size`: minimum size in mm of objects in the isotropic axis (default 0)
`min_vol`: minimum volume of instances in pixels (default 0)
============================================================================
    The segmentation labels will be split into things classes (classes to
    detect) and stuff classes (additional segmentation labels), which are
    saved as separate files.
"""
parser = argparse.ArgumentParser()
parser.add_argument('tasks', type=str, nargs='+',
help="Single or multiple task identifiers to process consecutively",
)
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('-o', '--overwrites', type=str, nargs='+',
help="overwrites for config file",
required=False,
)
parser.add_argument('--volume_ranking',
help="Create a ranking of instances based on their volume",
action='store_true',
)
parser.add_argument('--num_processes', type=int, default=4, required=False,
help="Number of processes to use for conversion.")
args = parser.parse_args()
tasks = args.tasks
ov = args.overwrites
overwrite = args.overwrite
do_volume_ranking = args.volume_ranking
num_processes = args.num_processes
initialize_config_module(config_module="nndet.conf")
for task in tasks:
cfg = compose(task, "config.yaml", overrides=ov if ov is not None else [])
print(cfg.pretty())
splitted_dir = Path(cfg["host"]["splitted_4d_output_dir"])
logger.remove()
logger.add(sys.stdout, level="INFO")
logger.add(splitted_dir / "convert_seg2det.log", level="DEBUG")
logger.info(f"+++++ Running covnersion: {datetime.now()} +++++")
logger.info(f"Running min_size {cfg['data'].get('min_size', 0)} and "
f"min_vol {cfg['data'].get('min_vol', 0)}")
for postfix in ["Tr", "Ts"]:
label_dir = splitted_dir / f"labels{postfix}"
case_ids = [f.name[:-7] for f in label_dir.glob("*.nii.gz")]
logger.info(f"Found {len(case_ids)} cases for conversion with postfix {postfix}.")
# for cid in case_ids:
# prepare_detection_label(case_id=cid,
# label_dir=label_dir,
# stuff_classes=cfg["data"]["seg2det_stuff"],
# things_classes=cfg["data"]["seg2det_things"],
# min_size=cfg["data"].get("min_size", 0),
# min_vol=cfg["data"].get("min_vol", 0),
# )
with Pool(processes=num_processes) as p:
p.starmap(prepare_detection_label, zip(
case_ids,
repeat(label_dir),
repeat(cfg["data"]["seg2det_things"]),
repeat(cfg["data"]["seg2det_stuff"]),
repeat(cfg["data"].get("min_size", 0)),
repeat(cfg["data"].get("min_vol", 0)),
))
if do_volume_ranking:
for postfix in ["Tr", "Ts"]:
if (label_dir := splitted_dir / f"labels{postfix}").is_dir():
ranking = []
for case_id in tqdm([f.stem for f in label_dir.glob("*.json")]):
instances = load_sitk_as_array(label_dir / f"{case_id}.nii.gz")[0]
instance_ids, instance_counts = np.unique(instances, return_counts=True)
cps = [np.argwhere(instances == iid)[0].tolist() for iid in instance_ids[1:]]
assert len(instance_ids) - 1 == len(cps)
tmp = [{"case_id": str(case_id), "instance_id": int(iid),
"vol": int(vol), "cp": list(cp)[::-1]}
for iid, vol, cp in zip(instance_ids[1:], instance_counts[1:], cps)]
ranking.extend(tmp)
ranking = sorted(ranking, key=lambda x: x["vol"])
save_json(ranking, splitted_dir / f"volume_ranking_{postfix}.json")
else:
logger.info(f"Did not find dir {label_dir} for volume ranking")