# integrity_checks.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import json
import numpy as np
import SimpleITK as sitk
import nibabel as nib
from multiprocessing import Pool
from .load_image import load_series


def verify_all_same_orientation(folder):
    """Check whether all ``.nii.gz`` files directly inside ``folder`` share one orientation.

    Every regular file whose name ends with ``.nii.gz`` is opened with nibabel
    and its affine is converted to axis codes with ``nib.aff2axcodes``.
    Sub-directories are not descended into.

    Args:
        folder: Directory to scan.

    Returns:
        A ``(all_same, unique_orientations)`` tuple. ``all_same`` is True only
        when exactly one distinct orientation was found;
        ``unique_orientations`` is the array of distinct orientations.
    """
    axis_codes = []
    for entry in os.listdir(folder):
        candidate = os.path.join(folder, entry)
        if not os.path.isfile(candidate) or not entry.endswith(".nii.gz"):
            continue
        axis_codes.append(nib.aff2axcodes(nib.load(candidate).affine))

    # Rows are orientations, so uniqueness is evaluated per row (axis=0).
    unique_orientations = np.unique(np.array(axis_codes), axis=0)
    return len(unique_orientations) == 1, unique_orientations


def verify_same_geometry(img_1: sitk.Image, img_2: sitk.Image):
    """Return True when two images have the same size, origin, spacing and direction.

    Sizes are integer voxel counts and are compared exactly. Origin, spacing
    and direction are floating point and are compared with ``np.isclose``
    (default tolerances).

    Args:
        img_1: First image; must provide ``GetOrigin``, ``GetSpacing``,
            ``GetDirection`` and ``GetSize``.
        img_2: Second image, same requirements.

    Returns:
        bool: True if all four properties match, False otherwise. Images of
        different dimensionality are reported as a mismatch (False).
    """
    # np.isclose applies a relative tolerance, which would accept an
    # off-by-one voxel difference on very long axes; sizes must match exactly.
    if tuple(img_1.GetSize()) != tuple(img_2.GetSize()):
        return False

    float_properties = (
        (img_1.GetOrigin(), img_2.GetOrigin()),
        (img_1.GetSpacing(), img_2.GetSpacing()),
        (img_1.GetDirection(), img_2.GetDirection()),
    )
    for first, second in float_properties:
        # Guard the lengths so a dimensionality mismatch yields False rather
        # than a broadcasting error inside np.isclose.
        if len(first) != len(second):
            return False
        if not np.all(np.isclose(first, second)):
            return False
    return True


def verify_contains_only_expected_labels(itk_img: str,
                                         valid_labels: (tuple, list)):
    """Check that a label image contains no values outside ``valid_labels``.

    The image is loaded with ``load_series`` and only the first of its three
    return values (the voxel array) is inspected.

    Args:
        itk_img: Path of the label image to load.
        valid_labels: Collection of label values that are allowed.

    Returns:
        A ``(is_valid, invalid_uniques)`` tuple: ``is_valid`` is True when no
        unexpected value was found, and ``invalid_uniques`` lists the distinct
        unexpected values.
    """
    voxels, _, _ = load_series(itk_img)
    unexpected = []
    for value in np.unique(voxels):
        if value in valid_labels:
            continue
        unexpected.append(value)
    return not unexpected, unexpected


def verify_same_geometry_and_shape_and_nonan(image_paths, label_path):
    """Assert that a label and its image modalities are NaN-free and share one geometry.

    The label at ``label_path`` is read first and checked for NaNs. Each path
    in ``image_paths`` is then read, checked for NaNs and compared against the
    label with ``verify_same_geometry``.

    Args:
        image_paths: Iterable of image file paths (one per modality).
        label_path: Path of the corresponding label file.

    Raises:
        AssertionError: On the first NaN found or the first geometry mismatch.
    """
    reference = sitk.ReadImage(label_path)
    label_has_nan = np.isnan(sitk.GetArrayFromImage(reference)).any()
    assert not label_has_nan, "There are NAN values in label {}.".format(
        label_path)

    for modality_path in image_paths:
        modality = sitk.ReadImage(modality_path)
        modality_has_nan = np.isnan(sitk.GetArrayFromImage(modality)).any()
        assert not modality_has_nan, "There are NAN values in image {}.".format(
            modality_path)
        assert verify_same_geometry(modality, reference), (
            "The geometry of the image {} does not match the geometry of the label {}. The pixel arrays "
            "will not be aligned and nnU-Net cannot use this data. Please make sure your image modalities "
            "are coregistered and have the same geometry as the label."
        ).format(modality_path, label_path)


def verify_training_dataset(folder,
                            num_modalities,
                            identifiers,
                            expected_labels,
                            train_images_dir="imagesTr",
                            train_labels_dir="labelsTr",
                            default_num_threads=8):
    """Validate the training images and labels of a dataset folder.

    For every case identifier the function checks that the label file
    ``<id>.nii.gz`` and one image file per modality (``<id>_`` followed by a
    zero-padded four-digit modality index and ``.nii.gz``) exist, contain no
    NaNs and share the same geometry. It then checks that no unlisted
    ``.nii.gz`` files remain in either folder, that ``expected_labels`` starts
    at 0 and increases by exactly 1, and that every label file contains only
    expected labels (evaluated in a process pool).

    Args:
        folder: Dataset root directory.
        num_modalities: Number of image modalities expected per case.
        identifiers: Case identifiers that must be present.
        expected_labels: Allowed label values; must be ``0, 1, 2, ...``.
        train_images_dir: Name of the training image sub-folder.
        train_labels_dir: Name of the training label sub-folder.
        default_num_threads: Number of worker processes for the label check.

    Raises:
        AssertionError: If any of the checks above fails.
    """
    imagesTr_folder = os.path.join(folder, train_images_dir)
    nii_files_in_imagesTr = [
        nii_path for nii_path in os.listdir(imagesTr_folder)
        if os.path.isfile(os.path.join(imagesTr_folder, nii_path)) and
        nii_path.endswith(".nii.gz")
    ]
    labelsTr_folder = os.path.join(folder, train_labels_dir)
    nii_files_in_labelsTr = [
        nii_path for nii_path in os.listdir(labelsTr_folder)
        if os.path.isfile(os.path.join(labelsTr_folder, nii_path)) and
        nii_path.endswith(".nii.gz")
    ]

    label_files = []
    for c in identifiers:
        # check if all files are present
        expected_label_file = os.path.join(folder, train_labels_dir,
                                           c + ".nii.gz")
        label_files.append(expected_label_file)
        expected_image_files = [
            os.path.join(folder, train_images_dir, c + "_%04.0d.nii.gz" % i)
            for i in range(num_modalities)
        ]
        assert os.path.isfile(
            expected_label_file
        ), "Could not find label file for case {}. Expected file: {}".format(
            c, expected_label_file)
        assert all(
            [os.path.isfile(i) for i in expected_image_files]
        ), "Some image files are missing for case {}. Expected files: {}.".format(
            c, expected_image_files)
        # check that all modalities and the label have the same shape and geometry
        verify_same_geometry_and_shape_and_nonan(expected_image_files,
                                                 expected_label_file)

        # Tick off the files that belong to a listed case; whatever remains
        # afterwards is not referenced by the dataset json.
        for i in expected_image_files:
            nii_files_in_imagesTr.remove(os.path.basename(i))
        nii_files_in_labelsTr.remove(os.path.basename(expected_label_file))

    assert len(
        nii_files_in_imagesTr
    ) == 0, "There are training cases in {} that are not listed in dataset json file.".format(
        train_images_dir)
    assert len(
        nii_files_in_labelsTr
    ) == 0, "There are training cases in {} that are not listed in dataset json file.".format(
        train_labels_dir)

    # check if labels are in consecutive order
    assert expected_labels[
        0] == 0, 'The first label must be 0 and maps to the background'
    labels_valid_consecutive = np.ediff1d(expected_labels) == 1
    assert all(
        labels_valid_consecutive
    ), f'Labels must be in consecutive order (0, 1, 2, ...). The labels {np.array(expected_labels)[1:][~labels_valid_consecutive]} do not satisfy this restriction'

    p = Pool(default_num_threads)
    try:
        results = p.starmap(
            verify_contains_only_expected_labels,
            zip(label_files, [expected_labels] * len(label_files)))
    finally:
        # Shut the workers down even when a worker raised, so that no
        # processes are left behind.
        p.close()
        p.join()

    for i, r in enumerate(results):
        assert r[
            0], "Unexpected labels found in file {}. Found these unexpected values {}.".format(
                label_files[i], r[1])


def verify_test_dataset(folder,
                        num_modalities,
                        identifiers,
                        test_images_dir="imagesTs"):
    """Validate the test images of a dataset folder.

    For every case identifier the function checks that one image file per
    modality (``<id>_`` followed by a zero-padded four-digit modality index
    and ``.nii.gz``) exists. When there is more than one modality it also
    checks that all modalities share the geometry of the first one. Finally
    it checks that no unlisted ``.nii.gz`` files remain in the test folder.

    Args:
        folder: Dataset root directory.
        num_modalities: Number of image modalities expected per case.
        identifiers: Case identifiers that must be present.
        test_images_dir: Name of the test image sub-folder.

    Raises:
        AssertionError: If a file is missing, modalities are not aligned, or
            unlisted files are present.
    """
    imagesTs_folder = os.path.join(folder, test_images_dir)
    nii_files_in_imagesTs = [
        nii_path for nii_path in os.listdir(imagesTs_folder)
        if os.path.isfile(os.path.join(imagesTs_folder, nii_path)) and
        nii_path.endswith(".nii.gz")
    ]
    for c in identifiers:
        # check if all files are present
        expected_image_files = [
            os.path.join(folder, test_images_dir, c + "_%04.0d.nii.gz" % i)
            for i in range(num_modalities)
        ]
        assert all(
            [os.path.isfile(i) for i in expected_image_files]
        ), "Some image files are missing for case {}. Expected files: {}.".format(
            c, expected_image_files)

        # verify that all modalities have the same geometry as the first one
        if num_modalities > 1:
            images_itk = [sitk.ReadImage(i) for i in expected_image_files]
            reference_img = images_itk[0]
            # start=1 keeps the index aligned with expected_image_files, so
            # the message names the mismatching file, not the reference.
            for idx, img in enumerate(images_itk[1:], start=1):
                assert verify_same_geometry(img, reference_img), (
                    "The modalities of the image {} do not seem to be "
                    "registered. Please coregister your modalities.".format(
                        expected_image_files[idx]))
        # Tick off the files that belong to a listed case.
        for i in expected_image_files:
            nii_files_in_imagesTs.remove(os.path.basename(i))
    assert len(
        nii_files_in_imagesTs
    ) == 0, "There are test cases in {} that are not listed in dataset json file. Their names are listed in {}.".format(
        test_images_dir, nii_files_in_imagesTs)


def verify_dataset_integrity(folder,
                             data_json="dataset.json",
                             train_images_dir="imagesTr",
                             train_labels_dir="labelsTr",
                             test_images_dir="imagesTs",
                             default_num_threads=8):
    """Run all integrity checks on a dataset folder described by a json file.

    The function requires the json file and the two training sub-folders to
    exist, reads the case lists, modalities and labels from the json, checks
    that all training images share one orientation and that training
    identifiers are unique, and then delegates to ``verify_training_dataset``
    and, when test cases are listed, to ``verify_test_dataset``.

    Args:
        folder: Dataset root directory.
        data_json: File name of the dataset description inside ``folder``.
        train_images_dir: Name of the training image sub-folder.
        train_labels_dir: Name of the training label sub-folder.
        test_images_dir: Name of the test image sub-folder.
        default_num_threads: Worker count forwarded to the training check.

    Raises:
        AssertionError: If any check fails.
    """

    def case_id(path):
        # File name without directory, cut at the first dot.
        return path.split("/")[-1].split('.')[0]

    json_path = os.path.join(folder, data_json)
    assert os.path.isfile(
        json_path
    ), "There needs to be a {} file in folder {}, but not found.".format(
        data_json, folder)
    for required_dir in (train_images_dir, train_labels_dir):
        assert os.path.isdir(
            os.path.join(folder, required_dir)
        ), "There needs to be a {} subfolder in folder {}, but not found.".format(
            required_dir, folder)

    with open(json_path, 'r') as handle:
        dataset = json.load(handle)
    training_cases = dataset['training']
    num_modalities = len(dataset['modality'].keys())
    test_cases = dataset['test']
    expected_train_identifiers = [
        case_id(entry['image']) for entry in training_cases
    ]
    expected_test_identifiers = [case_id(entry) for entry in test_cases]
    expected_labels = [int(key) for key in dataset['labels'].keys()]

    # check training dataset orientation
    all_same, _ = verify_all_same_orientation(
        os.path.join(folder, train_images_dir))
    assert all_same, "Not all images in the dataset have the same axis ordering. Please correct that by reorienting the data."

    # identifiers must be unique
    num_distinct = len(np.unique(expected_train_identifiers))
    assert len(
        expected_train_identifiers
    ) == num_distinct, "Found duplicate training labels in {}, please check your dataset.".format(
        data_json)

    verify_training_dataset(
        folder,
        num_modalities,
        expected_train_identifiers,
        expected_labels,
        train_images_dir=train_images_dir,
        train_labels_dir=train_labels_dir,
        default_num_threads=default_num_threads)

    # the test set is optional: only check it when cases are listed
    if len(expected_test_identifiers) == 0:
        return
    verify_test_dataset(
        folder,
        num_modalities,
        expected_test_identifiers,
        test_images_dir=test_images_dir)