"vscode:/vscode.git/clone" did not exist on "5a58226130fcd35774cd14d8c5ba638a4adb2bf4"
Commit b634945d authored by limm

support v0.6

parent 5b3792fc
# Copyright (c) Facebook, Inc. and its affiliates.
import copy
import logging
import numpy as np
from typing import List, Optional, Union
import torch
from detectron2.config import configurable
from . import detection_utils as utils
from . import transforms as T
"""
This file contains the default mapping that's applied to "dataset dicts".
"""
__all__ = ["DatasetMapper"]
class DatasetMapper:
"""
A callable which takes a dataset dict in Detectron2 Dataset format
and maps it into a format used by the model.
This is the default callable used to map your dataset dict into training data.
You may want to follow it to implement your own mapper for customized logic,
such as a different way to read or transform images.
See :doc:`/tutorials/data_loading` for details.
The callable currently does the following:
1. Read the image from "file_name"
2. Apply cropping/geometric transforms to the image and annotations
3. Prepare data and annotations into Tensors and :class:`Instances`
"""
@configurable
def __init__(
self,
is_train: bool,
*,
augmentations: List[Union[T.Augmentation, T.Transform]],
image_format: str,
use_instance_mask: bool = False,
use_keypoint: bool = False,
instance_mask_format: str = "polygon",
keypoint_hflip_indices: Optional[np.ndarray] = None,
precomputed_proposal_topk: Optional[int] = None,
recompute_boxes: bool = False,
):
"""
NOTE: this interface is experimental.
Args:
is_train: whether it's used in training or inference
augmentations: a list of augmentations or deterministic transforms to apply
image_format: an image format supported by :func:`detection_utils.read_image`.
use_instance_mask: whether to process instance segmentation annotations, if available
use_keypoint: whether to process keypoint annotations if available
instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation
masks into this format.
keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices`
precomputed_proposal_topk: if given, will load pre-computed
proposals from dataset_dict and keep the top k proposals for each image.
recompute_boxes: whether to overwrite bounding box annotations
by computing tight bounding boxes from instance mask annotations.
"""
if recompute_boxes:
assert use_instance_mask, "recompute_boxes requires instance masks"
# fmt: off
self.is_train = is_train
self.augmentations = T.AugmentationList(augmentations)
self.image_format = image_format
self.use_instance_mask = use_instance_mask
self.instance_mask_format = instance_mask_format
self.use_keypoint = use_keypoint
self.keypoint_hflip_indices = keypoint_hflip_indices
self.proposal_topk = precomputed_proposal_topk
self.recompute_boxes = recompute_boxes
# fmt: on
logger = logging.getLogger(__name__)
mode = "training" if is_train else "inference"
logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}")
@classmethod
def from_config(cls, cfg, is_train: bool = True):
augs = utils.build_augmentation(cfg, is_train)
if cfg.INPUT.CROP.ENABLED and is_train:
augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE))
recompute_boxes = cfg.MODEL.MASK_ON
else:
recompute_boxes = False
ret = {
"is_train": is_train,
"augmentations": augs,
"image_format": cfg.INPUT.FORMAT,
"use_instance_mask": cfg.MODEL.MASK_ON,
"instance_mask_format": cfg.INPUT.MASK_FORMAT,
"use_keypoint": cfg.MODEL.KEYPOINT_ON,
"recompute_boxes": recompute_boxes,
}
if cfg.MODEL.KEYPOINT_ON:
ret["keypoint_hflip_indices"] = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
if cfg.MODEL.LOAD_PROPOSALS:
ret["precomputed_proposal_topk"] = (
cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN
if is_train
else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST
)
return ret
def _transform_annotations(self, dataset_dict, transforms, image_shape):
# USER: Modify this if you want to keep them for some reason.
for anno in dataset_dict["annotations"]:
if not self.use_instance_mask:
anno.pop("segmentation", None)
if not self.use_keypoint:
anno.pop("keypoints", None)
# USER: Implement additional transformations if you have other types of data
annos = [
utils.transform_instance_annotations(
obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
)
for obj in dataset_dict.pop("annotations")
if obj.get("iscrowd", 0) == 0
]
instances = utils.annotations_to_instances(
annos, image_shape, mask_format=self.instance_mask_format
)
# After transforms such as cropping are applied, the bounding box may no longer
# tightly bound the object. As an example, imagine a triangle object
# [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
# bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
# the intersection of original bounding box and the cropping box.
if self.recompute_boxes:
instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
dataset_dict["instances"] = utils.filter_empty_instances(instances)
def __call__(self, dataset_dict):
"""
Args:
dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
Returns:
dict: a format that builtin models in detectron2 accept
"""
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
# USER: Write your own image loading if it's not from a file
image = utils.read_image(dataset_dict["file_name"], format=self.image_format)
utils.check_image_size(dataset_dict, image)
# USER: Remove if you don't do semantic/panoptic segmentation.
if "sem_seg_file_name" in dataset_dict:
sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2)
else:
sem_seg_gt = None
aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
transforms = self.augmentations(aug_input)
image, sem_seg_gt = aug_input.image, aug_input.sem_seg
image_shape = image.shape[:2] # h, w
# PyTorch's dataloader is efficient on torch.Tensor due to shared memory,
# but not efficient on large generic data structures due to the use of pickle & mp.Queue.
# Therefore it's important to use torch.Tensor.
dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
if sem_seg_gt is not None:
dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long"))
# USER: Remove if you don't use pre-computed proposals.
# Most users would not need this feature.
if self.proposal_topk is not None:
utils.transform_proposals(
dataset_dict, image_shape, transforms, proposal_topk=self.proposal_topk
)
if not self.is_train:
# USER: Modify this if you want to keep them for some reason.
dataset_dict.pop("annotations", None)
dataset_dict.pop("sem_seg_file_name", None)
return dataset_dict
if "annotations" in dataset_dict:
self._transform_annotations(dataset_dict, transforms, image_shape)
return dataset_dict
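As a rough usage sketch (not part of the file above), a `DatasetMapper` built from a config can be handed to detectron2's data-loader builder; the `@configurable` decorator makes `DatasetMapper(cfg, is_train=True)` equivalent to calling `from_config` and forwarding its result to `__init__`. The config below is only a default `CfgNode`; a real setup would also point `cfg.DATASETS.TRAIN` at a registered dataset.

from detectron2.config import get_cfg
from detectron2.data import DatasetMapper, build_detection_train_loader

cfg = get_cfg()                             # default config; customize DATASETS/INPUT as needed
mapper = DatasetMapper(cfg, is_train=True)  # routed through DatasetMapper.from_config(cfg, is_train=True)
train_loader = build_detection_train_loader(cfg, mapper=mapper)  # mapper applied to each dataset dict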
### Common Datasets
The datasets implemented here do not need to load the data into their final format.
They should provide the minimal data structure needed to use the dataset, so they can be very efficient.
For example, for an image dataset, just provide the file names and labels, but don't read the images.
Let the downstream code decide how to read them.
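For illustration, a minimal dataset function in that spirit only lists file names and registers itself with `DatasetCatalog`; the directory path and dataset name below are hypothetical, not part of this repo.

import os
from detectron2.data import DatasetCatalog

def my_image_list_dataset(image_dir="datasets/my_images"):  # hypothetical location
    # Return lightweight dicts with file names (and labels, if any); the images
    # themselves are read later, e.g. by a dataset mapper, not here.
    files = sorted(f for f in os.listdir(image_dir) if f.endswith(".jpg"))
    return [
        {"file_name": os.path.join(image_dir, f), "image_id": i}
        for i, f in enumerate(files)
    ]

DatasetCatalog.register("my_image_list", my_image_list_dataset)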
# Copyright (c) Facebook, Inc. and its affiliates.
from .coco import load_coco_json, load_sem_seg, register_coco_instances, convert_to_coco_json
from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated
from .lvis import load_lvis_json, register_lvis_instances, get_lvis_instances_meta
from .pascal_voc import load_voc_instances, register_pascal_voc
from . import builtin as _builtin # ensure the builtin datasets are registered
__all__ = [k for k in globals().keys() if not k.startswith("_")]
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
"""
This file registers pre-defined datasets at hard-coded paths, and their metadata.
We hard-code metadata for common datasets. This enables:
1. Consistency checks when loading the datasets
2. Using models on these standard datasets directly and running demos,
without having to download the dataset annotations
We hard-code some paths to the dataset that's assumed to
exist in "./datasets/".
Users SHOULD NOT use this file to create new datasets / metadata for new datasets.
To add a new dataset, refer to the tutorial "docs/DATASETS.md".
"""
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from .builtin_meta import ADE20K_SEM_SEG_CATEGORIES, _get_builtin_metadata
from .cityscapes import load_cityscapes_instances, load_cityscapes_semantic
from .cityscapes_panoptic import register_all_cityscapes_panoptic
from .coco import load_sem_seg, register_coco_instances
from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated
from .lvis import get_lvis_instances_meta, register_lvis_instances
from .pascal_voc import register_pascal_voc
# ==== Predefined datasets and splits for COCO ==========
_PREDEFINED_SPLITS_COCO = {}
_PREDEFINED_SPLITS_COCO["coco"] = {
"coco_2014_train": ("coco/train2014", "coco/annotations/instances_train2014.json"),
"coco_2014_val": ("coco/val2014", "coco/annotations/instances_val2014.json"),
"coco_2014_minival": ("coco/val2014", "coco/annotations/instances_minival2014.json"),
"coco_2014_minival_100": ("coco/val2014", "coco/annotations/instances_minival2014_100.json"),
"coco_2014_valminusminival": (
"coco/val2014",
"coco/annotations/instances_valminusminival2014.json",
),
"coco_2017_train": ("coco/train2017", "coco/annotations/instances_train2017.json"),
"coco_2017_val": ("coco/val2017", "coco/annotations/instances_val2017.json"),
"coco_2017_test": ("coco/test2017", "coco/annotations/image_info_test2017.json"),
"coco_2017_test-dev": ("coco/test2017", "coco/annotations/image_info_test-dev2017.json"),
"coco_2017_val_100": ("coco/val2017", "coco/annotations/instances_val2017_100.json"),
}
_PREDEFINED_SPLITS_COCO["coco_person"] = {
"keypoints_coco_2014_train": (
"coco/train2014",
"coco/annotations/person_keypoints_train2014.json",
),
"keypoints_coco_2014_val": ("coco/val2014", "coco/annotations/person_keypoints_val2014.json"),
"keypoints_coco_2014_minival": (
"coco/val2014",
"coco/annotations/person_keypoints_minival2014.json",
),
"keypoints_coco_2014_valminusminival": (
"coco/val2014",
"coco/annotations/person_keypoints_valminusminival2014.json",
),
"keypoints_coco_2014_minival_100": (
"coco/val2014",
"coco/annotations/person_keypoints_minival2014_100.json",
),
"keypoints_coco_2017_train": (
"coco/train2017",
"coco/annotations/person_keypoints_train2017.json",
),
"keypoints_coco_2017_val": ("coco/val2017", "coco/annotations/person_keypoints_val2017.json"),
"keypoints_coco_2017_val_100": (
"coco/val2017",
"coco/annotations/person_keypoints_val2017_100.json",
),
}
_PREDEFINED_SPLITS_COCO_PANOPTIC = {
"coco_2017_train_panoptic": (
# This is the original panoptic annotation directory
"coco/panoptic_train2017",
"coco/annotations/panoptic_train2017.json",
# This directory contains semantic annotations that are
# converted from panoptic annotations.
# It is used by PanopticFPN.
# You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
# to create these directories.
"coco/panoptic_stuff_train2017",
),
"coco_2017_val_panoptic": (
"coco/panoptic_val2017",
"coco/annotations/panoptic_val2017.json",
"coco/panoptic_stuff_val2017",
),
"coco_2017_val_100_panoptic": (
"coco/panoptic_val2017_100",
"coco/annotations/panoptic_val2017_100.json",
"coco/panoptic_stuff_val2017_100",
),
}
def register_all_coco(root):
for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO.items():
for key, (image_root, json_file) in splits_per_dataset.items():
# Assume pre-defined datasets live in `./datasets`.
register_coco_instances(
key,
_get_builtin_metadata(dataset_name),
os.path.join(root, json_file) if "://" not in json_file else json_file,
os.path.join(root, image_root),
)
for (
prefix,
(panoptic_root, panoptic_json, semantic_root),
) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
prefix_instances = prefix[: -len("_panoptic")]
instances_meta = MetadataCatalog.get(prefix_instances)
image_root, instances_json = instances_meta.image_root, instances_meta.json_file
# The "separated" version of COCO panoptic segmentation dataset,
# e.g. used by Panoptic FPN
register_coco_panoptic_separated(
prefix,
_get_builtin_metadata("coco_panoptic_separated"),
image_root,
os.path.join(root, panoptic_root),
os.path.join(root, panoptic_json),
os.path.join(root, semantic_root),
instances_json,
)
# The "standard" version of COCO panoptic segmentation dataset,
# e.g. used by Panoptic-DeepLab
register_coco_panoptic(
prefix,
_get_builtin_metadata("coco_panoptic_standard"),
image_root,
os.path.join(root, panoptic_root),
os.path.join(root, panoptic_json),
instances_json,
)
# ==== Predefined datasets and splits for LVIS ==========
_PREDEFINED_SPLITS_LVIS = {
"lvis_v1": {
"lvis_v1_train": ("coco/", "lvis/lvis_v1_train.json"),
"lvis_v1_val": ("coco/", "lvis/lvis_v1_val.json"),
"lvis_v1_test_dev": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"),
"lvis_v1_test_challenge": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"),
},
"lvis_v0.5": {
"lvis_v0.5_train": ("coco/", "lvis/lvis_v0.5_train.json"),
"lvis_v0.5_val": ("coco/", "lvis/lvis_v0.5_val.json"),
"lvis_v0.5_val_rand_100": ("coco/", "lvis/lvis_v0.5_val_rand_100.json"),
"lvis_v0.5_test": ("coco/", "lvis/lvis_v0.5_image_info_test.json"),
},
"lvis_v0.5_cocofied": {
"lvis_v0.5_train_cocofied": ("coco/", "lvis/lvis_v0.5_train_cocofied.json"),
"lvis_v0.5_val_cocofied": ("coco/", "lvis/lvis_v0.5_val_cocofied.json"),
},
}
def register_all_lvis(root):
for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items():
for key, (image_root, json_file) in splits_per_dataset.items():
register_lvis_instances(
key,
get_lvis_instances_meta(dataset_name),
os.path.join(root, json_file) if "://" not in json_file else json_file,
os.path.join(root, image_root),
)
# ==== Predefined splits for raw cityscapes images ===========
_RAW_CITYSCAPES_SPLITS = {
"cityscapes_fine_{task}_train": ("cityscapes/leftImg8bit/train/", "cityscapes/gtFine/train/"),
"cityscapes_fine_{task}_val": ("cityscapes/leftImg8bit/val/", "cityscapes/gtFine/val/"),
"cityscapes_fine_{task}_test": ("cityscapes/leftImg8bit/test/", "cityscapes/gtFine/test/"),
}
def register_all_cityscapes(root):
for key, (image_dir, gt_dir) in _RAW_CITYSCAPES_SPLITS.items():
meta = _get_builtin_metadata("cityscapes")
image_dir = os.path.join(root, image_dir)
gt_dir = os.path.join(root, gt_dir)
inst_key = key.format(task="instance_seg")
DatasetCatalog.register(
inst_key,
lambda x=image_dir, y=gt_dir: load_cityscapes_instances(
x, y, from_json=True, to_polygons=True
),
)
MetadataCatalog.get(inst_key).set(
image_dir=image_dir, gt_dir=gt_dir, evaluator_type="cityscapes_instance", **meta
)
sem_key = key.format(task="sem_seg")
DatasetCatalog.register(
sem_key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y)
)
MetadataCatalog.get(sem_key).set(
image_dir=image_dir,
gt_dir=gt_dir,
evaluator_type="cityscapes_sem_seg",
ignore_label=255,
**meta,
)
# ==== Predefined splits for PASCAL VOC ===========
def register_all_pascal_voc(root):
SPLITS = [
("voc_2007_trainval", "VOC2007", "trainval"),
("voc_2007_train", "VOC2007", "train"),
("voc_2007_val", "VOC2007", "val"),
("voc_2007_test", "VOC2007", "test"),
("voc_2012_trainval", "VOC2012", "trainval"),
("voc_2012_train", "VOC2012", "train"),
("voc_2012_val", "VOC2012", "val"),
]
for name, dirname, split in SPLITS:
year = 2007 if "2007" in name else 2012
register_pascal_voc(name, os.path.join(root, dirname), split, year)
MetadataCatalog.get(name).evaluator_type = "pascal_voc"
def register_all_ade20k(root):
root = os.path.join(root, "ADEChallengeData2016")
for name, dirname in [("train", "training"), ("val", "validation")]:
image_dir = os.path.join(root, "images", dirname)
gt_dir = os.path.join(root, "annotations_detectron2", dirname)
name = f"ade20k_sem_seg_{name}"
DatasetCatalog.register(
name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg")
)
MetadataCatalog.get(name).set(
stuff_classes=ADE20K_SEM_SEG_CATEGORIES[:],
image_root=image_dir,
sem_seg_root=gt_dir,
evaluator_type="sem_seg",
ignore_label=255,
)
# True for open source;
# Internally at fb, we register them elsewhere
if __name__.endswith(".builtin"):
# Assume pre-defined datasets live in `./datasets`.
_root = os.getenv("DETECTRON2_DATASETS", "datasets")
register_all_coco(_root)
register_all_lvis(_root)
register_all_cityscapes(_root)
register_all_cityscapes_panoptic(_root)
register_all_pascal_voc(_root)
register_all_ade20k(_root)
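As the docstring above notes, users should not add new datasets to this file. A minimal sketch of the intended alternative is to register one's own COCO-format dataset from user code; the dataset name and paths below are hypothetical.

from detectron2.data.datasets import register_coco_instances

# Register once at startup, then refer to "my_dataset_train" in cfg.DATASETS.TRAIN.
register_coco_instances(
    "my_dataset_train",             # dataset name
    {},                             # extra metadata (optional)
    "path/to/annotations.json",     # COCO-format annotation json
    "path/to/images",               # image root directory
)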
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
"""
Note:
For your custom dataset, there is no need to hard-code metadata anywhere in the code.
For example, for COCO-format datasets, metadata will be obtained automatically
when calling `load_coco_json`. For other datasets, metadata may also be obtained in other ways
during loading.
However, we hard-code metadata for a few common datasets here.
The only goal is to allow users who don't have these datasets to use pre-trained models.
Users don't have to download a COCO json (which contains metadata) in order to visualize a
COCO model (with correct class names and colors).
"""
# All coco categories, together with their nice-looking visualization colors
# It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json
COCO_CATEGORIES = [
{"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
{"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
{"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
{"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
{"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
{"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
{"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
{"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
{"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
{"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
{"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
{"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
{"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
{"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
{"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
{"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
{"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
{"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
{"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
{"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
{"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
{"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
{"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
{"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
{"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
{"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
{"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
{"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
{"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
{"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
{"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
{"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
{"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
{"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
{"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
{"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
{"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
{"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
{"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
{"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
{"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
{"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
{"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
{"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
{"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
{"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
{"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
{"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
{"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
{"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
{"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
{"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
{"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
{"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
{"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
{"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
{"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
{"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
{"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
{"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
{"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
{"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
{"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
{"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
{"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
{"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
{"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
{"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
{"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
{"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
{"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
{"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
{"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
{"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
{"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
{"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
{"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
{"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
{"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
{"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
{"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"},
{"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"},
{"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"},
{"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"},
{"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"},
{"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"},
{"color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"},
{"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"},
{"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"},
{"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"},
{"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"},
{"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"},
{"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"},
{"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"},
{"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"},
{"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"},
{"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"},
{"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"},
{"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"},
{"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"},
{"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"},
{"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"},
{"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"},
{"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"},
{"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"},
{"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"},
{"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"},
{"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"},
{"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"},
{"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"},
{"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"},
{"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"},
{"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"},
{"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"},
{"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"},
{"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"},
{"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"},
{"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"},
{"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"},
{"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"},
{"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"},
{"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"},
{"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"},
{"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"},
{"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"},
{"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"},
{"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"},
{"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"},
{"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"},
{"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"},
{"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"},
{"color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"},
{"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"},
]
# fmt: off
COCO_PERSON_KEYPOINT_NAMES = (
"nose",
"left_eye", "right_eye",
"left_ear", "right_ear",
"left_shoulder", "right_shoulder",
"left_elbow", "right_elbow",
"left_wrist", "right_wrist",
"left_hip", "right_hip",
"left_knee", "right_knee",
"left_ankle", "right_ankle",
)
# fmt: on
# Pairs of keypoints that should be exchanged under horizontal flipping
COCO_PERSON_KEYPOINT_FLIP_MAP = (
("left_eye", "right_eye"),
("left_ear", "right_ear"),
("left_shoulder", "right_shoulder"),
("left_elbow", "right_elbow"),
("left_wrist", "right_wrist"),
("left_hip", "right_hip"),
("left_knee", "right_knee"),
("left_ankle", "right_ankle"),
)
# rules for pairs of keypoints to draw a line between, and the line color to use.
KEYPOINT_CONNECTION_RULES = [
# face
("left_ear", "left_eye", (102, 204, 255)),
("right_ear", "right_eye", (51, 153, 255)),
("left_eye", "nose", (102, 0, 204)),
("nose", "right_eye", (51, 102, 255)),
# upper-body
("left_shoulder", "right_shoulder", (255, 128, 0)),
("left_shoulder", "left_elbow", (153, 255, 204)),
("right_shoulder", "right_elbow", (128, 229, 255)),
("left_elbow", "left_wrist", (153, 255, 153)),
("right_elbow", "right_wrist", (102, 255, 224)),
# lower-body
("left_hip", "right_hip", (255, 102, 0)),
("left_hip", "left_knee", (255, 255, 77)),
("right_hip", "right_knee", (153, 255, 204)),
("left_knee", "left_ankle", (191, 255, 128)),
("right_knee", "right_ankle", (255, 195, 77)),
]
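# A minimal, hypothetical sketch (not part of detectron2) of how these rules can be
# consumed: look up both endpoints of each rule by keypoint name and draw a segment
# in the given color. `keypoints_xy` and `draw_line` are assumed helpers, for
# illustration only.
def _draw_keypoint_connections(keypoints_xy, draw_line):
    # keypoints_xy: dict mapping keypoint name -> (x, y)
    # draw_line: any callable taking (point_a, point_b, color)
    for name_a, name_b, color in KEYPOINT_CONNECTION_RULES:
        if name_a in keypoints_xy and name_b in keypoints_xy:
            draw_line(keypoints_xy[name_a], keypoints_xy[name_b], color)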
# All Cityscapes categories, together with their nice-looking visualization colors
# It's from https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/helpers/labels.py # noqa
CITYSCAPES_CATEGORIES = [
{"color": (128, 64, 128), "isthing": 0, "id": 7, "trainId": 0, "name": "road"},
{"color": (244, 35, 232), "isthing": 0, "id": 8, "trainId": 1, "name": "sidewalk"},
{"color": (70, 70, 70), "isthing": 0, "id": 11, "trainId": 2, "name": "building"},
{"color": (102, 102, 156), "isthing": 0, "id": 12, "trainId": 3, "name": "wall"},
{"color": (190, 153, 153), "isthing": 0, "id": 13, "trainId": 4, "name": "fence"},
{"color": (153, 153, 153), "isthing": 0, "id": 17, "trainId": 5, "name": "pole"},
{"color": (250, 170, 30), "isthing": 0, "id": 19, "trainId": 6, "name": "traffic light"},
{"color": (220, 220, 0), "isthing": 0, "id": 20, "trainId": 7, "name": "traffic sign"},
{"color": (107, 142, 35), "isthing": 0, "id": 21, "trainId": 8, "name": "vegetation"},
{"color": (152, 251, 152), "isthing": 0, "id": 22, "trainId": 9, "name": "terrain"},
{"color": (70, 130, 180), "isthing": 0, "id": 23, "trainId": 10, "name": "sky"},
{"color": (220, 20, 60), "isthing": 1, "id": 24, "trainId": 11, "name": "person"},
{"color": (255, 0, 0), "isthing": 1, "id": 25, "trainId": 12, "name": "rider"},
{"color": (0, 0, 142), "isthing": 1, "id": 26, "trainId": 13, "name": "car"},
{"color": (0, 0, 70), "isthing": 1, "id": 27, "trainId": 14, "name": "truck"},
{"color": (0, 60, 100), "isthing": 1, "id": 28, "trainId": 15, "name": "bus"},
{"color": (0, 80, 100), "isthing": 1, "id": 31, "trainId": 16, "name": "train"},
{"color": (0, 0, 230), "isthing": 1, "id": 32, "trainId": 17, "name": "motorcycle"},
{"color": (119, 11, 32), "isthing": 1, "id": 33, "trainId": 18, "name": "bicycle"},
]
# fmt: off
ADE20K_SEM_SEG_CATEGORIES = [
"wall", "building", "sky", "floor", "tree", "ceiling", "road, route", "bed", "window ", "grass", "cabinet", "sidewalk, pavement", "person", "earth, ground", "door", "table", "mountain, mount", "plant", "curtain", "chair", "car", "water", "painting, picture", "sofa", "shelf", "house", "sea", "mirror", "rug", "field", "armchair", "seat", "fence", "desk", "rock, stone", "wardrobe, closet, press", "lamp", "tub", "rail", "cushion", "base, pedestal, stand", "box", "column, pillar", "signboard, sign", "chest of drawers, chest, bureau, dresser", "counter", "sand", "sink", "skyscraper", "fireplace", "refrigerator, icebox", "grandstand, covered stand", "path", "stairs", "runway", "case, display case, showcase, vitrine", "pool table, billiard table, snooker table", "pillow", "screen door, screen", "stairway, staircase", "river", "bridge, span", "bookcase", "blind, screen", "coffee table", "toilet, can, commode, crapper, pot, potty, stool, throne", "flower", "book", "hill", "bench", "countertop", "stove", "palm, palm tree", "kitchen island", "computer", "swivel chair", "boat", "bar", "arcade machine", "hovel, hut, hutch, shack, shanty", "bus", "towel", "light", "truck", "tower", "chandelier", "awning, sunshade, sunblind", "street lamp", "booth", "tv", "plane", "dirt track", "clothes", "pole", "land, ground, soil", "bannister, banister, balustrade, balusters, handrail", "escalator, moving staircase, moving stairway", "ottoman, pouf, pouffe, puff, hassock", "bottle", "buffet, counter, sideboard", "poster, posting, placard, notice, bill, card", "stage", "van", "ship", "fountain", "conveyer belt, conveyor belt, conveyer, conveyor, transporter", "canopy", "washer, automatic washer, washing machine", "plaything, toy", "pool", "stool", "barrel, cask", "basket, handbasket", "falls", "tent", "bag", "minibike, motorbike", "cradle", "oven", "ball", "food, solid food", "step, stair", "tank, storage tank", "trade name", "microwave", "pot", "animal", "bicycle", "lake", "dishwasher", "screen", "blanket, cover", "sculpture", "hood, exhaust hood", "sconce", "vase", "traffic light", "tray", "trash can", "fan", "pier", "crt screen", "plate", "monitor", "bulletin board", "shower", "radiator", "glass, drinking glass", "clock", "flag", # noqa
]
# After being processed by `prepare_ade20k_sem_seg.py`, id 255 means ignore
# fmt: on
def _get_coco_instances_meta():
thing_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 1]
thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
assert len(thing_ids) == 80, len(thing_ids)
# Mapping from the non-contiguous COCO category ids to contiguous ids in [0, 80)
thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
ret = {
"thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
"thing_classes": thing_classes,
"thing_colors": thing_colors,
}
return ret
def _get_coco_panoptic_separated_meta():
"""
Returns metadata for "separated" version of the panoptic segmentation dataset.
"""
stuff_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 0]
assert len(stuff_ids) == 53, len(stuff_ids)
# For semantic segmentation, this maps the stuff category ids in the dataset to
# contiguous ids in [1, 54) used in models (the inverse mapping is used when
# processing results); contiguous id 0 is reserved for an extra "thing" category.
stuff_dataset_id_to_contiguous_id = {k: i + 1 for i, k in enumerate(stuff_ids)}
# When converting COCO panoptic annotations to semantic annotations,
# we label the "thing" category as 0.
stuff_dataset_id_to_contiguous_id[0] = 0
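# e.g. the first stuff category, dataset id 92 ("banner"), maps to contiguous id 1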
# 54 names for COCO stuff categories (including "things")
stuff_classes = ["things"] + [
k["name"].replace("-other", "").replace("-merged", "")
for k in COCO_CATEGORIES
if k["isthing"] == 0
]
# NOTE: I randomly picked a color for things
stuff_colors = [[82, 18, 128]] + [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 0]
ret = {
"stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id,
"stuff_classes": stuff_classes,
"stuff_colors": stuff_colors,
}
ret.update(_get_coco_instances_meta())
return ret
def _get_builtin_metadata(dataset_name):
if dataset_name == "coco":
return _get_coco_instances_meta()
if dataset_name == "coco_panoptic_separated":
return _get_coco_panoptic_separated_meta()
elif dataset_name == "coco_panoptic_standard":
meta = {}
# The following metadata maps contiguous id from [0, #thing categories +
# #stuff categories) to their names and colors. We have to replicate the
# same name and color under "thing_*" and "stuff_*" because the current
# visualization function in D2 handles thing and stuff classes differently
# due to a heuristic used in Panoptic FPN. We keep the same naming to
# enable reusing existing visualization functions.
thing_classes = [k["name"] for k in COCO_CATEGORIES]
thing_colors = [k["color"] for k in COCO_CATEGORIES]
stuff_classes = [k["name"] for k in COCO_CATEGORIES]
stuff_colors = [k["color"] for k in COCO_CATEGORIES]
meta["thing_classes"] = thing_classes
meta["thing_colors"] = thing_colors
meta["stuff_classes"] = stuff_classes
meta["stuff_colors"] = stuff_colors
# Convert category id for training:
# category id: like semantic segmentation, it is the class id for each
# pixel. Since there are some classes not used in evaluation, the category
# id is not always contiguous and thus we have two sets of category ids:
# - original category id: category id in the original dataset, mainly
# used for evaluation.
# - contiguous category id: [0, #classes), in order to train the linear
# softmax classifier.
thing_dataset_id_to_contiguous_id = {}
stuff_dataset_id_to_contiguous_id = {}
for i, cat in enumerate(COCO_CATEGORIES):
if cat["isthing"]:
thing_dataset_id_to_contiguous_id[cat["id"]] = i
else:
stuff_dataset_id_to_contiguous_id[cat["id"]] = i
meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
return meta
elif dataset_name == "coco_person":
return {
"thing_classes": ["person"],
"keypoint_names": COCO_PERSON_KEYPOINT_NAMES,
"keypoint_flip_map": COCO_PERSON_KEYPOINT_FLIP_MAP,
"keypoint_connection_rules": KEYPOINT_CONNECTION_RULES,
}
elif dataset_name == "cityscapes":
# fmt: off
CITYSCAPES_THING_CLASSES = [
"person", "rider", "car", "truck",
"bus", "train", "motorcycle", "bicycle",
]
CITYSCAPES_STUFF_CLASSES = [
"road", "sidewalk", "building", "wall", "fence", "pole", "traffic light",
"traffic sign", "vegetation", "terrain", "sky", "person", "rider", "car",
"truck", "bus", "train", "motorcycle", "bicycle",
]
# fmt: on
return {
"thing_classes": CITYSCAPES_THING_CLASSES,
"stuff_classes": CITYSCAPES_STUFF_CLASSES,
}
raise KeyError("No built-in metadata for dataset {}".format(dataset_name))
# Copyright (c) Facebook, Inc. and its affiliates.
import functools
import json
import logging
import multiprocessing as mp
import numpy as np
import os
from itertools import chain
import pycocotools.mask as mask_util
from PIL import Image
from detectron2.structures import BoxMode
from detectron2.utils.comm import get_world_size
from detectron2.utils.file_io import PathManager
from detectron2.utils.logger import setup_logger
try:
import cv2 # noqa
except ImportError:
# OpenCV is an optional dependency at the moment
pass
logger = logging.getLogger(__name__)
def _get_cityscapes_files(image_dir, gt_dir):
files = []
# scan through the directory
cities = PathManager.ls(image_dir)
logger.info(f"{len(cities)} cities found in '{image_dir}'.")
for city in cities:
city_img_dir = os.path.join(image_dir, city)
city_gt_dir = os.path.join(gt_dir, city)
for basename in PathManager.ls(city_img_dir):
image_file = os.path.join(city_img_dir, basename)
suffix = "leftImg8bit.png"
assert basename.endswith(suffix), basename
basename = basename[: -len(suffix)]
instance_file = os.path.join(city_gt_dir, basename + "gtFine_instanceIds.png")
label_file = os.path.join(city_gt_dir, basename + "gtFine_labelIds.png")
json_file = os.path.join(city_gt_dir, basename + "gtFine_polygons.json")
files.append((image_file, instance_file, label_file, json_file))
assert len(files), "No images found in {}".format(image_dir)
for f in files[0]:
assert PathManager.isfile(f), f
return files
def load_cityscapes_instances(image_dir, gt_dir, from_json=True, to_polygons=True):
"""
Args:
image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
from_json (bool): whether to read annotations from the raw json file or the png files.
to_polygons (bool): whether to represent the segmentation as polygons
(COCO's format) instead of masks (cityscapes's format).
Returns:
list[dict]: a list of dicts in Detectron2 standard format. (See
`Using Custom Datasets </tutorials/datasets.html>`_ )
"""
if from_json:
assert to_polygons, (
"Cityscapes's json annotations are in polygon format. "
"Converting to mask format is not supported now."
)
files = _get_cityscapes_files(image_dir, gt_dir)
logger.info("Preprocessing cityscapes annotations ...")
# This is still not fast: all workers will execute duplicate work and it can
# take up to 10 minutes on an 8-GPU server.
pool = mp.Pool(processes=max(mp.cpu_count() // get_world_size() // 2, 4))
ret = pool.map(
functools.partial(_cityscapes_files_to_dict, from_json=from_json, to_polygons=to_polygons),
files,
)
logger.info("Loaded {} images from {}".format(len(ret), image_dir))
# Map cityscapes ids to contiguous ids
from cityscapesscripts.helpers.labels import labels
labels = [l for l in labels if l.hasInstances and not l.ignoreInEval]
dataset_id_to_contiguous_id = {l.id: idx for idx, l in enumerate(labels)}
for dict_per_image in ret:
for anno in dict_per_image["annotations"]:
anno["category_id"] = dataset_id_to_contiguous_id[anno["category_id"]]
return ret
def load_cityscapes_semantic(image_dir, gt_dir):
"""
Args:
image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
Returns:
list[dict]: a list of dict, each has "file_name" and
"sem_seg_file_name".
"""
ret = []
# gt_dir is small and contains many small files, so it makes sense to fetch it to local storage first
gt_dir = PathManager.get_local_path(gt_dir)
for image_file, _, label_file, json_file in _get_cityscapes_files(image_dir, gt_dir):
label_file = label_file.replace("labelIds", "labelTrainIds")
with PathManager.open(json_file, "r") as f:
jsonobj = json.load(f)
ret.append(
{
"file_name": image_file,
"sem_seg_file_name": label_file,
"height": jsonobj["imgHeight"],
"width": jsonobj["imgWidth"],
}
)
assert len(ret), f"No images found in {image_dir}!"
assert PathManager.isfile(
ret[0]["sem_seg_file_name"]
), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py" # noqa
return ret
def _cityscapes_files_to_dict(files, from_json, to_polygons):
"""
Parse cityscapes annotation files into an instance segmentation dataset dict.
Args:
files (tuple): consists of (image_file, instance_id_file, label_id_file, json_file)
from_json (bool): whether to read annotations from the raw json file or the png files.
to_polygons (bool): whether to represent the segmentation as polygons
(COCO's format) instead of masks (cityscapes's format).
Returns:
A dict in Detectron2 Dataset format.
"""
from cityscapesscripts.helpers.labels import id2label, name2label
image_file, instance_id_file, _, json_file = files
annos = []
if from_json:
from shapely.geometry import MultiPolygon, Polygon
with PathManager.open(json_file, "r") as f:
jsonobj = json.load(f)
ret = {
"file_name": image_file,
"image_id": os.path.basename(image_file),
"height": jsonobj["imgHeight"],
"width": jsonobj["imgWidth"],
}
# `polygons_union` contains the union of all valid polygons.
polygons_union = Polygon()
# CityscapesScripts draws the polygons in sequential order
# and each polygon *overwrites* existing ones. See
# (https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/preparation/json2instanceImg.py) # noqa
# We use reverse order, and each polygon *avoids* earlier ones.
# This resolves the polygon overlaps in the same way as CityscapesScripts.
for obj in jsonobj["objects"][::-1]:
if "deleted" in obj: # cityscapes data format specific
continue
label_name = obj["label"]
try:
label = name2label[label_name]
except KeyError:
if label_name.endswith("group"): # crowd area
label = name2label[label_name[: -len("group")]]
else:
raise
if label.id < 0: # cityscapes data format
continue
# Cityscapes' raw annotations use integer coordinates,
# therefore we add 0.5 here
poly_coord = np.asarray(obj["polygon"], dtype="f4") + 0.5
# CityscapesScripts uses PIL.ImageDraw.polygon to rasterize
# polygons for evaluation. This function operates in integer space
# and draws each pixel whose center falls into the polygon.
# Therefore it draws a polygon which is 0.5 "fatter" in expectation.
# We therefore dilate the input polygon by 0.5 as our input.
poly = Polygon(poly_coord).buffer(0.5, resolution=4)
if not label.hasInstances or label.ignoreInEval:
# Even if we won't store the polygon, it still contributes to overlap resolution
polygons_union = polygons_union.union(poly)
continue
# Take non-overlapping part of the polygon
poly_wo_overlaps = poly.difference(polygons_union)
if poly_wo_overlaps.is_empty:
continue
polygons_union = polygons_union.union(poly)
anno = {}
anno["iscrowd"] = label_name.endswith("group")
anno["category_id"] = label.id
if isinstance(poly_wo_overlaps, Polygon):
poly_list = [poly_wo_overlaps]
elif isinstance(poly_wo_overlaps, MultiPolygon):
poly_list = poly_wo_overlaps.geoms
else:
raise NotImplementedError("Unknown geometric structure {}".format(poly_wo_overlaps))
poly_coord = []
for poly_el in poly_list:
# COCO API can work only with exterior boundaries now, hence we store only them.
# TODO: store both exterior and interior boundaries once other parts of the
# codebase support holes in polygons.
poly_coord.append(list(chain(*poly_el.exterior.coords)))
anno["segmentation"] = poly_coord
(xmin, ymin, xmax, ymax) = poly_wo_overlaps.bounds
anno["bbox"] = (xmin, ymin, xmax, ymax)
anno["bbox_mode"] = BoxMode.XYXY_ABS
annos.append(anno)
else:
# See also the official annotation parsing scripts at
# https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/instances2dict.py # noqa
with PathManager.open(instance_id_file, "rb") as f:
inst_image = np.asarray(Image.open(f), order="F")
# ids < 24 are stuff labels (filtering them first is about 5% faster)
flattened_ids = np.unique(inst_image[inst_image >= 24])
ret = {
"file_name": image_file,
"image_id": os.path.basename(image_file),
"height": inst_image.shape[0],
"width": inst_image.shape[1],
}
for instance_id in flattened_ids:
# For non-crowd annotations, instance_id // 1000 is the label_id
# Crowd annotations have <1000 instance ids
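# e.g. instance_id 26001 -> label_id 26 ("car"), instance 1; a crowd region of cars is stored as instance_id 26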
label_id = instance_id // 1000 if instance_id >= 1000 else instance_id
label = id2label[label_id]
if not label.hasInstances or label.ignoreInEval:
continue
anno = {}
anno["iscrowd"] = instance_id < 1000
anno["category_id"] = label.id
mask = np.asarray(inst_image == instance_id, dtype=np.uint8, order="F")
inds = np.nonzero(mask)
ymin, ymax = inds[0].min(), inds[0].max()
xmin, xmax = inds[1].min(), inds[1].max()
anno["bbox"] = (xmin, ymin, xmax, ymax)
if xmax <= xmin or ymax <= ymin:
continue
anno["bbox_mode"] = BoxMode.XYXY_ABS
if to_polygons:
# This conversion comes from D4809743 and D5171122,
# when Mask-RCNN was first developed.
contours = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[
-2
]
polygons = [c.reshape(-1).tolist() for c in contours if len(c) >= 3]
# OpenCV can produce invalid polygons
if len(polygons) == 0:
continue
anno["segmentation"] = polygons
else:
anno["segmentation"] = mask_util.encode(mask[:, :, None])[0]
annos.append(anno)
ret["annotations"] = annos
return ret
if __name__ == "__main__":
"""
Test the cityscapes dataset loader.
Usage:
python -m detectron2.data.datasets.cityscapes \
cityscapes/leftImg8bit/train cityscapes/gtFine/train
"""
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("image_dir")
parser.add_argument("gt_dir")
parser.add_argument("--type", choices=["instance", "semantic"], default="instance")
args = parser.parse_args()
from detectron2.data.catalog import Metadata
from detectron2.utils.visualizer import Visualizer
from cityscapesscripts.helpers.labels import labels
logger = setup_logger(name=__name__)
dirname = "cityscapes-data-vis"
os.makedirs(dirname, exist_ok=True)
if args.type == "instance":
dicts = load_cityscapes_instances(
args.image_dir, args.gt_dir, from_json=True, to_polygons=True
)
logger.info("Done loading {} samples.".format(len(dicts)))
thing_classes = [k.name for k in labels if k.hasInstances and not k.ignoreInEval]
meta = Metadata().set(thing_classes=thing_classes)
else:
dicts = load_cityscapes_semantic(args.image_dir, args.gt_dir)
logger.info("Done loading {} samples.".format(len(dicts)))
stuff_classes = [k.name for k in labels if k.trainId != 255]
stuff_colors = [k.color for k in labels if k.trainId != 255]
meta = Metadata().set(stuff_classes=stuff_classes, stuff_colors=stuff_colors)
for d in dicts:
img = np.array(Image.open(PathManager.open(d["file_name"], "rb")))
visualizer = Visualizer(img, metadata=meta)
vis = visualizer.draw_dataset_dict(d)
# cv2.imshow("a", vis.get_image()[:, :, ::-1])
# cv2.waitKey()
fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
vis.save(fpath)
# Copyright (c) Facebook, Inc. and its affiliates.
import json
import logging
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets.builtin_meta import CITYSCAPES_CATEGORIES
from detectron2.utils.file_io import PathManager
"""
This file contains functions to register the Cityscapes panoptic dataset to the DatasetCatalog.
"""
logger = logging.getLogger(__name__)
def get_cityscapes_panoptic_files(image_dir, gt_dir, json_info):
files = []
# scan through the directory
cities = PathManager.ls(image_dir)
logger.info(f"{len(cities)} cities found in '{image_dir}'.")
image_dict = {}
for city in cities:
city_img_dir = os.path.join(image_dir, city)
for basename in PathManager.ls(city_img_dir):
image_file = os.path.join(city_img_dir, basename)
suffix = "_leftImg8bit.png"
assert basename.endswith(suffix), basename
basename = os.path.basename(basename)[: -len(suffix)]
image_dict[basename] = image_file
for ann in json_info["annotations"]:
image_file = image_dict.get(ann["image_id"], None)
assert image_file is not None, "No image {} found for annotation {}".format(
ann["image_id"], ann["file_name"]
)
label_file = os.path.join(gt_dir, ann["file_name"])
segments_info = ann["segments_info"]
files.append((image_file, label_file, segments_info))
assert len(files), "No images found in {}".format(image_dir)
assert PathManager.isfile(files[0][0]), files[0][0]
assert PathManager.isfile(files[0][1]), files[0][1]
return files
def load_cityscapes_panoptic(image_dir, gt_dir, gt_json, meta):
"""
Args:
image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
gt_dir (str): path to the raw annotations. e.g.,
"~/cityscapes/gtFine/cityscapes_panoptic_train".
gt_json (str): path to the json file. e.g.,
"~/cityscapes/gtFine/cityscapes_panoptic_train.json".
meta (dict): dictionary containing "thing_dataset_id_to_contiguous_id"
and "stuff_dataset_id_to_contiguous_id" to map category ids to
contiguous ids for training.
Returns:
list[dict]: a list of dicts in Detectron2 standard format. (See
`Using Custom Datasets </tutorials/datasets.html>`_ )
"""
def _convert_category_id(segment_info, meta):
if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
segment_info["category_id"]
]
else:
segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
segment_info["category_id"]
]
return segment_info
assert os.path.exists(
gt_json
), "Please run `python cityscapesscripts/preparation/createPanopticImgs.py` to generate label files." # noqa
with open(gt_json) as f:
json_info = json.load(f)
files = get_cityscapes_panoptic_files(image_dir, gt_dir, json_info)
ret = []
for image_file, label_file, segments_info in files:
sem_label_file = (
image_file.replace("leftImg8bit", "gtFine").split(".")[0] + "_labelTrainIds.png"
)
segments_info = [_convert_category_id(x, meta) for x in segments_info]
ret.append(
{
"file_name": image_file,
"image_id": "_".join(
os.path.splitext(os.path.basename(image_file))[0].split("_")[:3]
),
"sem_seg_file_name": sem_label_file,
"pan_seg_file_name": label_file,
"segments_info": segments_info,
}
)
assert len(ret), f"No images found in {image_dir}!"
assert PathManager.isfile(
ret[0]["sem_seg_file_name"]
), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py" # noqa
assert PathManager.isfile(
ret[0]["pan_seg_file_name"]
), "Please generate panoptic annotation with python cityscapesscripts/preparation/createPanopticImgs.py" # noqa
return ret
_RAW_CITYSCAPES_PANOPTIC_SPLITS = {
"cityscapes_fine_panoptic_train": (
"cityscapes/leftImg8bit/train",
"cityscapes/gtFine/cityscapes_panoptic_train",
"cityscapes/gtFine/cityscapes_panoptic_train.json",
),
"cityscapes_fine_panoptic_val": (
"cityscapes/leftImg8bit/val",
"cityscapes/gtFine/cityscapes_panoptic_val",
"cityscapes/gtFine/cityscapes_panoptic_val.json",
),
# "cityscapes_fine_panoptic_test": not supported yet
}
def register_all_cityscapes_panoptic(root):
meta = {}
# The following metadata maps contiguous id from [0, #thing categories +
# #stuff categories) to their names and colors. We have to replicate the
# same name and color under "thing_*" and "stuff_*" because the current
# visualization function in D2 handles thing and stuff classes differently
# due to a heuristic used in Panoptic FPN. We keep the same naming to
# enable reusing existing visualization functions.
thing_classes = [k["name"] for k in CITYSCAPES_CATEGORIES]
thing_colors = [k["color"] for k in CITYSCAPES_CATEGORIES]
stuff_classes = [k["name"] for k in CITYSCAPES_CATEGORIES]
stuff_colors = [k["color"] for k in CITYSCAPES_CATEGORIES]
meta["thing_classes"] = thing_classes
meta["thing_colors"] = thing_colors
meta["stuff_classes"] = stuff_classes
meta["stuff_colors"] = stuff_colors
# There are three types of ids in cityscapes panoptic segmentation:
# (1) category id: like semantic segmentation, it is the class id for each
# pixel. Since there are some classes not used in evaluation, the category
# id is not always contiguous and thus we have two sets of category ids:
# - original category id: category id in the original dataset, mainly
# used for evaluation.
# - contiguous category id: [0, #classes), in order to train the classifier
# (2) instance id: this id is used to differentiate different instances from
# the same category. For "stuff" classes, the instance id is always 0; for
# "thing" classes, the instance id starts from 1 and 0 is reserved for
# ignored instances (e.g. crowd annotation).
# (3) panoptic id: this is the compact id that encodes both category and
# instance id by: category_id * 1000 + instance_id.
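# For example, panoptic id 26001 encodes category_id 26 ("car") and instance_id 1.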
thing_dataset_id_to_contiguous_id = {}
stuff_dataset_id_to_contiguous_id = {}
for k in CITYSCAPES_CATEGORIES:
if k["isthing"] == 1:
thing_dataset_id_to_contiguous_id[k["id"]] = k["trainId"]
else:
stuff_dataset_id_to_contiguous_id[k["id"]] = k["trainId"]
meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
for key, (image_dir, gt_dir, gt_json) in _RAW_CITYSCAPES_PANOPTIC_SPLITS.items():
image_dir = os.path.join(root, image_dir)
gt_dir = os.path.join(root, gt_dir)
gt_json = os.path.join(root, gt_json)
DatasetCatalog.register(
key, lambda x=image_dir, y=gt_dir, z=gt_json: load_cityscapes_panoptic(x, y, z, meta)
)
MetadataCatalog.get(key).set(
panoptic_root=gt_dir,
image_root=image_dir,
panoptic_json=gt_json,
gt_dir=gt_dir.replace("cityscapes_panoptic_", ""),
evaluator_type="cityscapes_panoptic_seg",
ignore_label=255,
label_divisor=1000,
**meta,
)
# Copyright (c) Facebook, Inc. and its affiliates.
import contextlib
import datetime
import io
import json
import logging
import numpy as np
import os
import shutil
import pycocotools.mask as mask_util
from fvcore.common.timer import Timer
from iopath.common.file_io import file_lock
from PIL import Image
from detectron2.structures import Boxes, BoxMode, PolygonMasks, RotatedBoxes
from detectron2.utils.file_io import PathManager
from .. import DatasetCatalog, MetadataCatalog
"""
This file contains functions to parse COCO-format annotations into dicts in "Detectron2 format".
"""
logger = logging.getLogger(__name__)
__all__ = ["load_coco_json", "load_sem_seg", "convert_to_coco_json", "register_coco_instances"]
def load_coco_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None):
"""
Load a json file with COCO's instances annotation format.
Currently supports instance detection, instance segmentation,
and person keypoints annotations.
Args:
json_file (str): full path to the json file in COCO instances annotation format.
image_root (str or path-like): the directory where the images in this json file exist.
dataset_name (str or None): the name of the dataset (e.g., coco_2017_train).
When provided, this function will also do the following:
* Put "thing_classes" into the metadata associated with this dataset.
* Map the category ids into a contiguous range (needed by standard dataset format),
and add "thing_dataset_id_to_contiguous_id" to the metadata associated
with this dataset.
This option should usually be provided, unless users need to load
the original json content and apply more processing manually.
extra_annotation_keys (list[str]): list of per-annotation keys that should also be
loaded into the dataset dict (besides "iscrowd", "bbox", "keypoints",
"category_id", "segmentation"). The values for these keys will be returned as-is.
For example, the densepose annotations are loaded in this way.
Returns:
list[dict]: a list of dicts in Detectron2 standard dataset dicts format (See
`Using Custom Datasets </tutorials/datasets.html>`_ ) when `dataset_name` is not None.
If `dataset_name` is None, the returned `category_ids` may be
incontiguous and may not conform to the Detectron2 standard format.
Notes:
1. This function does not read the image files.
The results do not have the "image" field.
"""
from pycocotools.coco import COCO
timer = Timer()
json_file = PathManager.get_local_path(json_file)
with contextlib.redirect_stdout(io.StringIO()):
coco_api = COCO(json_file)
if timer.seconds() > 1:
logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
id_map = None
if dataset_name is not None:
meta = MetadataCatalog.get(dataset_name)
cat_ids = sorted(coco_api.getCatIds())
cats = coco_api.loadCats(cat_ids)
# The categories in a custom json file may not be sorted.
thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])]
meta.thing_classes = thing_classes
# In COCO, certain category ids are artificially removed,
# and by convention they are always ignored.
# We deal with COCO's id issue and translate
# the category ids to contiguous ids in [0, 80).
# It works by looking at the "categories" field in the json, therefore
# if users' own json also has incontiguous ids, we'll
# apply this mapping as well but print a warning.
if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)):
if "coco" not in dataset_name:
logger.warning(
"""
Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
"""
)
id_map = {v: i for i, v in enumerate(cat_ids)}
meta.thing_dataset_id_to_contiguous_id = id_map
# sort indices for reproducible results
img_ids = sorted(coco_api.imgs.keys())
# imgs is a list of dicts, each looks something like:
# {'license': 4,
# 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
# 'file_name': 'COCO_val2014_000000001268.jpg',
# 'height': 427,
# 'width': 640,
# 'date_captured': '2013-11-17 05:57:24',
# 'id': 1268}
imgs = coco_api.loadImgs(img_ids)
# anns is a list[list[dict]], where each dict is an annotation
# record for an object. The inner list enumerates the objects in an image
# and the outer list enumerates over images. Example of anns[0]:
# [{'segmentation': [[192.81,
# 247.09,
# ...
# 219.03,
# 249.06]],
# 'area': 1035.749,
# 'iscrowd': 0,
# 'image_id': 1268,
# 'bbox': [192.81, 224.8, 74.73, 33.43],
# 'category_id': 16,
# 'id': 42986},
# ...]
anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
total_num_valid_anns = sum([len(x) for x in anns])
total_num_anns = len(coco_api.anns)
if total_num_valid_anns < total_num_anns:
logger.warning(
f"{json_file} contains {total_num_anns} annotations, but only "
f"{total_num_valid_anns} of them match to images in the file."
)
if "minival" not in json_file:
# The popular valminusminival & minival annotations for COCO2014 contain this bug.
# However the ratio of buggy annotations there is tiny and does not affect accuracy.
# Therefore we explicitly white-list them.
ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
json_file
)
imgs_anns = list(zip(imgs, anns))
logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file))
dataset_dicts = []
ann_keys = ["iscrowd", "bbox", "keypoints", "category_id"] + (extra_annotation_keys or [])
num_instances_without_valid_segmentation = 0
for (img_dict, anno_dict_list) in imgs_anns:
record = {}
record["file_name"] = os.path.join(image_root, img_dict["file_name"])
record["height"] = img_dict["height"]
record["width"] = img_dict["width"]
image_id = record["image_id"] = img_dict["id"]
objs = []
for anno in anno_dict_list:
# Check that the image_id in this annotation is the same as
# the image_id we're looking at.
# This fails only when the data parsing logic or the annotation file is buggy.
# The original COCO valminusminival2014 & minival2014 annotation files
# actually contain bugs that, together with certain ways of using the COCO API,
# can trigger this assertion.
assert anno["image_id"] == image_id
assert anno.get("ignore", 0) == 0, '"ignore" in COCO json file is not supported.'
obj = {key: anno[key] for key in ann_keys if key in anno}
if "bbox" in obj and len(obj["bbox"]) == 0:
raise ValueError(
f"One annotation of image {image_id} contains empty 'bbox' value! "
"This json does not have valid COCO format."
)
segm = anno.get("segmentation", None)
if segm: # either list[list[float]] or dict(RLE)
if isinstance(segm, dict):
if isinstance(segm["counts"], list):
# convert to compressed RLE
segm = mask_util.frPyObjects(segm, *segm["size"])
else:
# filter out invalid polygons (< 3 points)
segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
if len(segm) == 0:
num_instances_without_valid_segmentation += 1
continue # ignore this instance
obj["segmentation"] = segm
keypts = anno.get("keypoints", None)
if keypts: # list[int]
for idx, v in enumerate(keypts):
if idx % 3 != 2:
# COCO's segmentation coordinates are floating points in [0, H or W],
# but keypoint coordinates are integers in [0, H-1 or W-1]
# Therefore we assume the coordinates are "pixel indices" and
# add 0.5 to convert to floating point coordinates.
keypts[idx] = v + 0.5
obj["keypoints"] = keypts
obj["bbox_mode"] = BoxMode.XYWH_ABS
if id_map:
annotation_category_id = obj["category_id"]
try:
obj["category_id"] = id_map[annotation_category_id]
except KeyError as e:
raise KeyError(
f"Encountered category_id={annotation_category_id} "
"but this id does not exist in 'categories' of the json file."
) from e
objs.append(obj)
record["annotations"] = objs
dataset_dicts.append(record)
if num_instances_without_valid_segmentation > 0:
logger.warning(
"Filtered out {} instances without valid segmentation. ".format(
num_instances_without_valid_segmentation
)
+ "There might be issues in your dataset generation process. Please "
"check https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html carefully"
)
return dataset_dicts
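# Example usage (a minimal sketch; the paths and dataset name below are hypothetical):
#
#     dicts = load_coco_json(
#         "datasets/my_coco/annotations/instances_train.json",
#         "datasets/my_coco/images",
#         dataset_name="my_coco_train",
#     )
#     # each dict carries "file_name", "height", "width", "image_id" and "annotations"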
def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"):
"""
Load semantic segmentation datasets. All files under "gt_root" with "gt_ext" extension are
treated as ground truth annotations and all files under "image_root" with "image_ext" extension
as input images. Ground truth and input images are matched using file paths relative to
"gt_root" and "image_root" respectively without taking into account file extensions.
This works for COCO as well as some other datasets.
Args:
gt_root (str): full path to ground truth semantic segmentation files. Semantic segmentation
annotations are stored as images with integer values in pixels that represent
corresponding semantic labels.
image_root (str): the directory where the input images are.
gt_ext (str): file extension for ground truth annotations.
image_ext (str): file extension for input images.
Returns:
list[dict]:
a list of dicts in detectron2 standard format without instance-level
annotation.
Notes:
1. This function does not read the image and ground truth files.
The results do not have the "image" and "sem_seg" fields.
"""
# We match input images with ground truth based on their relative filepaths (without file
# extensions) starting from 'image_root' and 'gt_root' respectively.
def file2id(folder_path, file_path):
# extract relative path starting from `folder_path`
image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
# remove file extension
image_id = os.path.splitext(image_id)[0]
return image_id
input_files = sorted(
(os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)),
key=lambda file_path: file2id(image_root, file_path),
)
gt_files = sorted(
(os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)),
key=lambda file_path: file2id(gt_root, file_path),
)
assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root)
# Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images
if len(input_files) != len(gt_files):
logger.warning(
"Directory {} and {} has {} and {} files, respectively.".format(
image_root, gt_root, len(input_files), len(gt_files)
)
)
input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
intersect = list(set(input_basenames) & set(gt_basenames))
# sort, otherwise each worker may obtain a list[dict] in different order
intersect = sorted(intersect)
logger.warn("Will use their intersection of {} files.".format(len(intersect)))
input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]
logger.info(
"Loaded {} images with semantic segmentation from {}".format(len(input_files), image_root)
)
dataset_dicts = []
for (img_path, gt_path) in zip(input_files, gt_files):
record = {}
record["file_name"] = img_path
record["sem_seg_file_name"] = gt_path
dataset_dicts.append(record)
return dataset_dicts
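# Example usage (a minimal sketch; the directory layout below is hypothetical, with ground truth
# PNGs mirroring the image filenames):
#
#     dicts = load_sem_seg(
#         gt_root="datasets/my_sem_seg/annotations", image_root="datasets/my_sem_seg/images"
#     )
#     # each dict only carries "file_name" and "sem_seg_file_name"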
def convert_to_coco_dict(dataset_name):
"""
Convert an instance detection/segmentation or keypoint detection dataset
in detectron2's standard format into COCO json format.
Generic dataset description can be found here:
https://detectron2.readthedocs.io/tutorials/datasets.html#register-a-dataset
COCO data format description can be found here:
http://cocodataset.org/#format-data
Args:
dataset_name (str):
name of the source dataset
Must be registered in DatasetCatalog and in detectron2's standard format.
Must have corresponding metadata "thing_classes"
Returns:
coco_dict: serializable dict in COCO json format
"""
dataset_dicts = DatasetCatalog.get(dataset_name)
metadata = MetadataCatalog.get(dataset_name)
# unmap the category mapping ids for COCO
if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
reverse_id_mapping = {v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items()}
reverse_id_mapper = lambda contiguous_id: reverse_id_mapping[contiguous_id] # noqa
else:
reverse_id_mapper = lambda contiguous_id: contiguous_id # noqa
categories = [
{"id": reverse_id_mapper(id), "name": name}
for id, name in enumerate(metadata.thing_classes)
]
logger.info("Converting dataset dicts into COCO format")
coco_images = []
coco_annotations = []
for image_id, image_dict in enumerate(dataset_dicts):
coco_image = {
"id": image_dict.get("image_id", image_id),
"width": int(image_dict["width"]),
"height": int(image_dict["height"]),
"file_name": str(image_dict["file_name"]),
}
coco_images.append(coco_image)
anns_per_image = image_dict.get("annotations", [])
for annotation in anns_per_image:
# create a new dict with only COCO fields
coco_annotation = {}
# COCO requirement: XYWH box format for axis-align and XYWHA for rotated
bbox = annotation["bbox"]
if isinstance(bbox, np.ndarray):
if bbox.ndim != 1:
raise ValueError(f"bbox has to be 1-dimensional. Got shape={bbox.shape}.")
bbox = bbox.tolist()
if len(bbox) not in [4, 5]:
raise ValueError(f"bbox has to has length 4 or 5. Got {bbox}.")
from_bbox_mode = annotation["bbox_mode"]
to_bbox_mode = BoxMode.XYWH_ABS if len(bbox) == 4 else BoxMode.XYWHA_ABS
bbox = BoxMode.convert(bbox, from_bbox_mode, to_bbox_mode)
# COCO requirement: instance area
if "segmentation" in annotation:
# Computing areas for instances by counting the pixels
segmentation = annotation["segmentation"]
# TODO: check segmentation type: RLE, BinaryMask or Polygon
if isinstance(segmentation, list):
polygons = PolygonMasks([segmentation])
area = polygons.area()[0].item()
elif isinstance(segmentation, dict): # RLE
area = mask_util.area(segmentation).item()
else:
raise TypeError(f"Unknown segmentation type {type(segmentation)}!")
else:
# Computing areas using bounding boxes
if to_bbox_mode == BoxMode.XYWH_ABS:
bbox_xy = BoxMode.convert(bbox, to_bbox_mode, BoxMode.XYXY_ABS)
area = Boxes([bbox_xy]).area()[0].item()
else:
area = RotatedBoxes([bbox]).area()[0].item()
if "keypoints" in annotation:
keypoints = annotation["keypoints"] # list[int]
for idx, v in enumerate(keypoints):
if idx % 3 != 2:
# COCO's segmentation coordinates are floating points in [0, H or W],
# but keypoint coordinates are integers in [0, H-1 or W-1]
# For COCO format consistency we subtract 0.5
# https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163
keypoints[idx] = v - 0.5
if "num_keypoints" in annotation:
num_keypoints = annotation["num_keypoints"]
else:
num_keypoints = sum(kp > 0 for kp in keypoints[2::3])
# COCO requirement:
# linking annotations to images
# "id" field must start with 1
coco_annotation["id"] = len(coco_annotations) + 1
coco_annotation["image_id"] = coco_image["id"]
coco_annotation["bbox"] = [round(float(x), 3) for x in bbox]
coco_annotation["area"] = float(area)
coco_annotation["iscrowd"] = int(annotation.get("iscrowd", 0))
coco_annotation["category_id"] = int(reverse_id_mapper(annotation["category_id"]))
# Add optional fields
if "keypoints" in annotation:
coco_annotation["keypoints"] = keypoints
coco_annotation["num_keypoints"] = num_keypoints
if "segmentation" in annotation:
seg = coco_annotation["segmentation"] = annotation["segmentation"]
if isinstance(seg, dict): # RLE
counts = seg["counts"]
if not isinstance(counts, str):
# make it json-serializable
seg["counts"] = counts.decode("ascii")
coco_annotations.append(coco_annotation)
logger.info(
"Conversion finished, "
f"#images: {len(coco_images)}, #annotations: {len(coco_annotations)}"
)
info = {
"date_created": str(datetime.datetime.now()),
"description": "Automatically generated COCO json file for Detectron2.",
}
coco_dict = {"info": info, "images": coco_images, "categories": categories, "licenses": None}
if len(coco_annotations) > 0:
coco_dict["annotations"] = coco_annotations
return coco_dict
def convert_to_coco_json(dataset_name, output_file, allow_cached=True):
"""
Converts dataset into COCO format and saves it to a json file.
dataset_name must be registered in DatasetCatalog and in detectron2's standard format.
Args:
dataset_name:
reference from the config file to the catalogs
must be registered in DatasetCatalog and in detectron2's standard format
output_file: path of json file that will be saved to
allow_cached: if json file is already present then skip conversion
"""
# TODO: The dataset or the conversion script *may* change,
# a checksum would be useful for validating the cached data
PathManager.mkdirs(os.path.dirname(output_file))
with file_lock(output_file):
if PathManager.exists(output_file) and allow_cached:
logger.warning(
f"Using previously cached COCO format annotations at '{output_file}'. "
"You need to clear the cache file if your dataset has been modified."
)
else:
logger.info(f"Converting annotations of dataset '{dataset_name}' to COCO format ...)")
coco_dict = convert_to_coco_dict(dataset_name)
logger.info(f"Caching COCO format annotations at '{output_file}' ...")
tmp_file = output_file + ".tmp"
with PathManager.open(tmp_file, "w") as f:
json.dump(coco_dict, f)
shutil.move(tmp_file, output_file)
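# Example usage (a minimal sketch; assumes "my_dataset_train" is already registered and the
# output path is hypothetical):
#
#     convert_to_coco_json("my_dataset_train", "output/my_dataset_train_coco_format.json")
#     # with allow_cached=True (the default), a second call reuses the cached json if present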
def register_coco_instances(name, metadata, json_file, image_root):
"""
Register a dataset in COCO's json annotation format for
instance detection, instance segmentation and keypoint detection.
(i.e., Type 1 and 2 in http://cocodataset.org/#format-data.
`instances*.json` and `person_keypoints*.json` in the dataset).
This is an example of how to register a new dataset.
You can write something similar to this function to register your own datasets.
Args:
name (str): the name that identifies a dataset, e.g. "coco_2014_train".
metadata (dict): extra metadata associated with this dataset. You can
leave it as an empty dict.
json_file (str): path to the json instance annotation file.
image_root (str or path-like): directory which contains all the images.
"""
assert isinstance(name, str), name
assert isinstance(json_file, (str, os.PathLike)), json_file
assert isinstance(image_root, (str, os.PathLike)), image_root
# 1. register a function which returns dicts
DatasetCatalog.register(name, lambda: load_coco_json(json_file, image_root, name))
# 2. Optionally, add metadata about this dataset,
# since they might be useful in evaluation, visualization or logging
MetadataCatalog.get(name).set(
json_file=json_file, image_root=image_root, evaluator_type="coco", **metadata
)
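# Example usage (a minimal sketch; the dataset name and paths below are hypothetical):
#
#     register_coco_instances(
#         "my_dataset_train", {}, "datasets/my_dataset/annotations.json", "datasets/my_dataset/images"
#     )
#     dicts = DatasetCatalog.get("my_dataset_train")   # lazily calls load_coco_json
#     meta = MetadataCatalog.get("my_dataset_train")   # json_file, image_root, thing_classes, ...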
if __name__ == "__main__":
"""
Test the COCO json dataset loader.
Usage:
python -m detectron2.data.datasets.coco \
path/to/json path/to/image_root dataset_name
"dataset_name" can be "coco_2014_minival_100", or other
pre-registered ones
"""
from detectron2.utils.logger import setup_logger
from detectron2.utils.visualizer import Visualizer
import detectron2.data.datasets # noqa # add pre-defined metadata
import sys
logger = setup_logger(name=__name__)
assert sys.argv[3] in DatasetCatalog.list()
meta = MetadataCatalog.get(sys.argv[3])
dicts = load_coco_json(sys.argv[1], sys.argv[2], sys.argv[3])
logger.info("Done loading {} samples.".format(len(dicts)))
dirname = "coco-data-vis"
os.makedirs(dirname, exist_ok=True)
for d in dicts:
img = np.array(Image.open(d["file_name"]))
visualizer = Visualizer(img, metadata=meta)
vis = visualizer.draw_dataset_dict(d)
fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
vis.save(fpath)
# Copyright (c) Facebook, Inc. and its affiliates.
import copy
import json
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.file_io import PathManager
from .coco import load_coco_json, load_sem_seg
__all__ = ["register_coco_panoptic", "register_coco_panoptic_separated"]
def load_coco_panoptic_json(json_file, image_dir, gt_dir, meta):
"""
Args:
image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
Returns:
list[dict]: a list of dicts in Detectron2 standard format. (See
`Using Custom Datasets </tutorials/datasets.html>`_ )
"""
def _convert_category_id(segment_info, meta):
if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
segment_info["category_id"]
]
segment_info["isthing"] = True
else:
segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
segment_info["category_id"]
]
segment_info["isthing"] = False
return segment_info
with PathManager.open(json_file) as f:
json_info = json.load(f)
ret = []
for ann in json_info["annotations"]:
image_id = int(ann["image_id"])
# TODO: currently we assume image and label has the same filename but
# different extension, and images have extension ".jpg" for COCO. Need
# to make image extension a user-provided argument if we extend this
# function to support other COCO-like datasets.
image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg")
label_file = os.path.join(gt_dir, ann["file_name"])
segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]]
ret.append(
{
"file_name": image_file,
"image_id": image_id,
"pan_seg_file_name": label_file,
"segments_info": segments_info,
}
)
assert len(ret), f"No images found in {image_dir}!"
assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"]
return ret
def register_coco_panoptic(
name, metadata, image_root, panoptic_root, panoptic_json, instances_json=None
):
"""
Register a "standard" version of COCO panoptic segmentation dataset named `name`.
The dictionaries in this registered dataset follow detectron2's standard format.
Hence it's called "standard".
Args:
name (str): the name that identifies a dataset,
e.g. "coco_2017_train_panoptic"
metadata (dict): extra metadata associated with this dataset.
image_root (str): directory which contains all the images
panoptic_root (str): directory which contains panoptic annotation images in COCO format
panoptic_json (str): path to the json panoptic annotation file in COCO format
instances_json (str): path to the json instance annotation file
"""
panoptic_name = name
DatasetCatalog.register(
panoptic_name,
lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, metadata),
)
MetadataCatalog.get(panoptic_name).set(
panoptic_root=panoptic_root,
image_root=image_root,
panoptic_json=panoptic_json,
json_file=instances_json,
evaluator_type="coco_panoptic_seg",
ignore_label=255,
label_divisor=1000,
**metadata,
)
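# Example usage (a minimal sketch; the name and paths are hypothetical). Note that `metadata`
# must already contain the "thing_dataset_id_to_contiguous_id" and
# "stuff_dataset_id_to_contiguous_id" mappings consumed by load_coco_panoptic_json above:
#
#     register_coco_panoptic(
#         "my_panoptic_train",
#         metadata=meta,  # dict carrying the category-id mappings
#         image_root="datasets/coco/train2017",
#         panoptic_root="datasets/coco/panoptic_train2017",
#         panoptic_json="datasets/coco/annotations/panoptic_train2017.json",
#     )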
def register_coco_panoptic_separated(
name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
):
"""
Register a "separated" version of COCO panoptic segmentation dataset named `name`.
The annotations in this registered dataset will contain both instance annotations and
semantic annotations, each with its own contiguous ids. Hence it's called "separated".
It follows the setting used by the PanopticFPN paper:
1. The instance annotations directly come from polygons in the COCO
instances annotation task, rather than from the masks in the COCO panoptic annotations.
The two formats have small differences:
Polygons in the instance annotations may have overlaps.
The mask annotations are produced by labeling the overlapped polygons
with depth ordering.
2. The semantic annotations are converted from panoptic annotations, where
all "things" are assigned a semantic id of 0.
All semantic categories will therefore have ids in contiguous
range [1, #stuff_categories].
This function will also register a pure semantic segmentation dataset
named ``name + '_stuffonly'``.
Args:
name (str): the name that identifies a dataset,
e.g. "coco_2017_train_panoptic"
metadata (dict): extra metadata associated with this dataset.
image_root (str): directory which contains all the images
panoptic_root (str): directory which contains panoptic annotation images
panoptic_json (str): path to the json panoptic annotation file
sem_seg_root (str): directory which contains all the ground truth segmentation annotations.
instances_json (str): path to the json instance annotation file
"""
panoptic_name = name + "_separated"
DatasetCatalog.register(
panoptic_name,
lambda: merge_to_panoptic(
load_coco_json(instances_json, image_root, panoptic_name),
load_sem_seg(sem_seg_root, image_root),
),
)
MetadataCatalog.get(panoptic_name).set(
panoptic_root=panoptic_root,
image_root=image_root,
panoptic_json=panoptic_json,
sem_seg_root=sem_seg_root,
json_file=instances_json, # TODO rename
evaluator_type="coco_panoptic_seg",
ignore_label=255,
**metadata,
)
semantic_name = name + "_stuffonly"
DatasetCatalog.register(semantic_name, lambda: load_sem_seg(sem_seg_root, image_root))
MetadataCatalog.get(semantic_name).set(
sem_seg_root=sem_seg_root,
image_root=image_root,
evaluator_type="sem_seg",
ignore_label=255,
**metadata,
)
def merge_to_panoptic(detection_dicts, sem_seg_dicts):
"""
Create dataset dicts for panoptic segmentation, by
merging two dicts using "file_name" field to match their entries.
Args:
detection_dicts (list[dict]): lists of dicts for object detection or instance segmentation.
sem_seg_dicts (list[dict]): lists of dicts for semantic segmentation.
Returns:
list[dict] (one per input image): Each dict contains all (key, value) pairs from dicts in
both detection_dicts and sem_seg_dicts that correspond to the same image.
The function assumes that the same key in different dicts has the same value.
"""
results = []
sem_seg_file_to_entry = {x["file_name"]: x for x in sem_seg_dicts}
assert len(sem_seg_file_to_entry) > 0
for det_dict in detection_dicts:
dic = copy.copy(det_dict)
dic.update(sem_seg_file_to_entry[dic["file_name"]])
results.append(dic)
return results
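# Tiny illustration of the merge (hypothetical records): entries are matched on "file_name",
# and the keys of the semantic-segmentation dict are added onto the detection dict:
#
#     det = [{"file_name": "a.jpg", "annotations": [...]}]
#     sem = [{"file_name": "a.jpg", "sem_seg_file_name": "a.png"}]
#     merge_to_panoptic(det, sem)
#     # -> [{"file_name": "a.jpg", "annotations": [...], "sem_seg_file_name": "a.png"}]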
if __name__ == "__main__":
"""
Test the COCO panoptic dataset loader.
Usage:
python -m detectron2.data.datasets.coco_panoptic \
path/to/image_root path/to/panoptic_root path/to/panoptic_json dataset_name 10
"dataset_name" can be "coco_2017_train_panoptic", or other
pre-registered ones
"""
from detectron2.utils.logger import setup_logger
from detectron2.utils.visualizer import Visualizer
import detectron2.data.datasets # noqa # add pre-defined metadata
import sys
from PIL import Image
import numpy as np
logger = setup_logger(name=__name__)
assert sys.argv[4] in DatasetCatalog.list()
meta = MetadataCatalog.get(sys.argv[4])
dicts = load_coco_panoptic_json(sys.argv[3], sys.argv[1], sys.argv[2], meta.as_dict())
logger.info("Done loading {} samples.".format(len(dicts)))
dirname = "coco-data-vis"
os.makedirs(dirname, exist_ok=True)
num_imgs_to_vis = int(sys.argv[5])
for i, d in enumerate(dicts):
img = np.array(Image.open(d["file_name"]))
visualizer = Visualizer(img, metadata=meta)
vis = visualizer.draw_dataset_dict(d)
fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
vis.save(fpath)
if i + 1 >= num_imgs_to_vis:
break
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import os
from fvcore.common.timer import Timer
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
from detectron2.utils.file_io import PathManager
from .builtin_meta import _get_coco_instances_meta
from .lvis_v0_5_categories import LVIS_CATEGORIES as LVIS_V0_5_CATEGORIES
from .lvis_v1_categories import LVIS_CATEGORIES as LVIS_V1_CATEGORIES
"""
This file contains functions to parse LVIS-format annotations into dicts in the
"Detectron2 format".
"""
logger = logging.getLogger(__name__)
__all__ = ["load_lvis_json", "register_lvis_instances", "get_lvis_instances_meta"]
def register_lvis_instances(name, metadata, json_file, image_root):
"""
Register a dataset in LVIS's json annotation format for instance detection and segmentation.
Args:
name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train".
metadata (dict): extra metadata associated with this dataset. It can be an empty dict.
json_file (str): path to the json instance annotation file.
image_root (str or path-like): directory which contains all the images.
"""
DatasetCatalog.register(name, lambda: load_lvis_json(json_file, image_root, name))
MetadataCatalog.get(name).set(
json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata
)
def load_lvis_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None):
"""
Load a json file in LVIS's annotation format.
Args:
json_file (str): full path to the LVIS json annotation file.
image_root (str): the directory where the images in this json file exist.
dataset_name (str): the name of the dataset (e.g., "lvis_v0.5_train").
If provided, this function will put "thing_classes" into the metadata
associated with this dataset.
extra_annotation_keys (list[str]): list of per-annotation keys that should also be
loaded into the dataset dict (besides "bbox", "bbox_mode", "category_id",
"segmentation"). The values for these keys will be returned as-is.
Returns:
list[dict]: a list of dicts in Detectron2 standard format. (See
`Using Custom Datasets </tutorials/datasets.html>`_ )
Notes:
1. This function does not read the image files.
The results do not have the "image" field.
"""
from lvis import LVIS
json_file = PathManager.get_local_path(json_file)
timer = Timer()
lvis_api = LVIS(json_file)
if timer.seconds() > 1:
logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
if dataset_name is not None:
meta = get_lvis_instances_meta(dataset_name)
MetadataCatalog.get(dataset_name).set(**meta)
# sort indices for reproducible results
img_ids = sorted(lvis_api.imgs.keys())
# imgs is a list of dicts, each looks something like:
# {'license': 4,
# 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
# 'file_name': 'COCO_val2014_000000001268.jpg',
# 'height': 427,
# 'width': 640,
# 'date_captured': '2013-11-17 05:57:24',
# 'id': 1268}
imgs = lvis_api.load_imgs(img_ids)
# anns is a list[list[dict]], where each dict is an annotation
# record for an object. The inner list enumerates the objects in an image
# and the outer list enumerates over images. Example of anns[0]:
# [{'segmentation': [[192.81,
# 247.09,
# ...
# 219.03,
# 249.06]],
# 'area': 1035.749,
# 'image_id': 1268,
# 'bbox': [192.81, 224.8, 74.73, 33.43],
# 'category_id': 16,
# 'id': 42986},
# ...]
anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]
# Sanity check that each annotation has a unique id
ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique".format(
json_file
)
imgs_anns = list(zip(imgs, anns))
logger.info("Loaded {} images in the LVIS format from {}".format(len(imgs_anns), json_file))
if extra_annotation_keys:
logger.info(
"The following extra annotation keys will be loaded: {} ".format(extra_annotation_keys)
)
else:
extra_annotation_keys = []
def get_file_name(img_root, img_dict):
# Determine the path including the split folder ("train2017", "val2017", "test2017") from
# the coco_url field. Example:
# 'coco_url': 'http://images.cocodataset.org/train2017/000000155379.jpg'
split_folder, file_name = img_dict["coco_url"].split("/")[-2:]
return os.path.join(img_root + split_folder, file_name)
dataset_dicts = []
for (img_dict, anno_dict_list) in imgs_anns:
record = {}
record["file_name"] = get_file_name(image_root, img_dict)
record["height"] = img_dict["height"]
record["width"] = img_dict["width"]
record["not_exhaustive_category_ids"] = img_dict.get("not_exhaustive_category_ids", [])
record["neg_category_ids"] = img_dict.get("neg_category_ids", [])
image_id = record["image_id"] = img_dict["id"]
objs = []
for anno in anno_dict_list:
# Check that the image_id in this annotation is the same as
# the image_id we're looking at.
# This fails only when the data parsing logic or the annotation file is buggy.
assert anno["image_id"] == image_id
obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS}
# LVIS data loader can be used to load COCO dataset categories. In this case `meta`
# variable will have a field with COCO-specific category mapping.
if dataset_name is not None and "thing_dataset_id_to_contiguous_id" in meta:
obj["category_id"] = meta["thing_dataset_id_to_contiguous_id"][anno["category_id"]]
else:
obj["category_id"] = anno["category_id"] - 1 # Convert 1-indexed to 0-indexed
segm = anno["segmentation"] # list[list[float]]
# filter out invalid polygons (< 3 points)
valid_segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
assert len(segm) == len(
valid_segm
), "Annotation contains an invalid polygon with < 3 points"
assert len(segm) > 0
obj["segmentation"] = segm
for extra_ann_key in extra_annotation_keys:
obj[extra_ann_key] = anno[extra_ann_key]
objs.append(obj)
record["annotations"] = objs
dataset_dicts.append(record)
return dataset_dicts
def get_lvis_instances_meta(dataset_name):
"""
Load LVIS metadata.
Args:
dataset_name (str): LVIS dataset name without the split name (e.g., "lvis_v0.5").
Returns:
dict: LVIS metadata with keys: thing_classes
"""
if "cocofied" in dataset_name:
return _get_coco_instances_meta()
if "v0.5" in dataset_name:
return _get_lvis_instances_meta_v0_5()
elif "v1" in dataset_name:
return _get_lvis_instances_meta_v1()
raise ValueError("No built-in metadata for dataset {}".format(dataset_name))
def _get_lvis_instances_meta_v0_5():
assert len(LVIS_V0_5_CATEGORIES) == 1230
cat_ids = [k["id"] for k in LVIS_V0_5_CATEGORIES]
assert min(cat_ids) == 1 and max(cat_ids) == len(
cat_ids
), "Category ids are not in [1, #categories], as expected"
# Ensure that the category list is sorted by id
lvis_categories = sorted(LVIS_V0_5_CATEGORIES, key=lambda x: x["id"])
thing_classes = [k["synonyms"][0] for k in lvis_categories]
meta = {"thing_classes": thing_classes}
return meta
def _get_lvis_instances_meta_v1():
assert len(LVIS_V1_CATEGORIES) == 1203
cat_ids = [k["id"] for k in LVIS_V1_CATEGORIES]
assert min(cat_ids) == 1 and max(cat_ids) == len(
cat_ids
), "Category ids are not in [1, #categories], as expected"
# Ensure that the category list is sorted by id
lvis_categories = sorted(LVIS_V1_CATEGORIES, key=lambda x: x["id"])
thing_classes = [k["synonyms"][0] for k in lvis_categories]
meta = {"thing_classes": thing_classes}
return meta
if __name__ == "__main__":
"""
Test the LVIS json dataset loader.
Usage:
python -m detectron2.data.datasets.lvis \
path/to/json path/to/image_root dataset_name vis_limit
"""
import sys
import numpy as np
from detectron2.utils.logger import setup_logger
from PIL import Image
import detectron2.data.datasets # noqa # add pre-defined metadata
from detectron2.utils.visualizer import Visualizer
logger = setup_logger(name=__name__)
meta = MetadataCatalog.get(sys.argv[3])
dicts = load_lvis_json(sys.argv[1], sys.argv[2], sys.argv[3])
logger.info("Done loading {} samples.".format(len(dicts)))
dirname = "lvis-data-vis"
os.makedirs(dirname, exist_ok=True)
for d in dicts[: int(sys.argv[4])]:
img = np.array(Image.open(d["file_name"]))
visualizer = Visualizer(img, metadata=meta)
vis = visualizer.draw_dataset_dict(d)
fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
vis.save(fpath)
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
import numpy as np
import os
import xml.etree.ElementTree as ET
from typing import List, Tuple, Union
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
from detectron2.utils.file_io import PathManager
__all__ = ["load_voc_instances", "register_pascal_voc"]
# fmt: off
CLASS_NAMES = (
"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
"chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
"pottedplant", "sheep", "sofa", "train", "tvmonitor"
)
# fmt: on
def load_voc_instances(dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
"""
Load Pascal VOC detection annotations to Detectron2 format.
Args:
dirname: Contain "Annotations", "ImageSets", "JPEGImages"
split (str): one of "train", "test", "val", "trainval"
class_names: list or tuple of class names
"""
with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f:
fileids = np.loadtxt(f, dtype=str)
# Needs to read many small annotation files, so it makes sense to fetch a local copy first.
annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/"))
dicts = []
for fileid in fileids:
anno_file = os.path.join(annotation_dirname, fileid + ".xml")
jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg")
with PathManager.open(anno_file) as f:
tree = ET.parse(f)
r = {
"file_name": jpeg_file,
"image_id": fileid,
"height": int(tree.findall("./size/height")[0].text),
"width": int(tree.findall("./size/width")[0].text),
}
instances = []
for obj in tree.findall("object"):
cls = obj.find("name").text
# We include "difficult" samples in training.
# Based on limited experiments, they don't hurt accuracy.
# difficult = int(obj.find("difficult").text)
# if difficult == 1:
# continue
bbox = obj.find("bndbox")
bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]]
# Original annotations are integers in the range [1, W or H]
# Assuming they mean 1-based pixel indices (inclusive),
# a box with annotation (xmin=1, xmax=W) covers the whole image.
# In coordinate space this is represented by (xmin=0, xmax=W)
bbox[0] -= 1.0
bbox[1] -= 1.0
instances.append(
{"category_id": class_names.index(cls), "bbox": bbox, "bbox_mode": BoxMode.XYXY_ABS}
)
r["annotations"] = instances
dicts.append(r)
return dicts
def register_pascal_voc(name, dirname, split, year, class_names=CLASS_NAMES):
DatasetCatalog.register(name, lambda: load_voc_instances(dirname, split, class_names))
MetadataCatalog.get(name).set(
thing_classes=list(class_names), dirname=dirname, year=year, split=split
)
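# Example usage (a minimal sketch; the dataset root below is hypothetical and follows the usual
# VOC layout with "Annotations", "ImageSets" and "JPEGImages"):
#
#     register_pascal_voc("my_voc_2007_trainval", "datasets/VOC2007", "trainval", 2007)
#     dicts = DatasetCatalog.get("my_voc_2007_trainval")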
# Copyright (c) Facebook, Inc. and its affiliates.
from .coco import register_coco_instances # noqa
from .coco_panoptic import register_coco_panoptic_separated # noqa
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
"""
Common data processing utilities that are used in a
typical object detection data pipeline.
"""
import logging
import numpy as np
from typing import List, Union
import pycocotools.mask as mask_util
import torch
from PIL import Image
from detectron2.structures import (
BitMasks,
Boxes,
BoxMode,
Instances,
Keypoints,
PolygonMasks,
RotatedBoxes,
polygons_to_bitmask,
)
from detectron2.utils.file_io import PathManager
from . import transforms as T
from .catalog import MetadataCatalog
__all__ = [
"SizeMismatchError",
"convert_image_to_rgb",
"check_image_size",
"transform_proposals",
"transform_instance_annotations",
"annotations_to_instances",
"annotations_to_instances_rotated",
"build_augmentation",
"build_transform_gen",
"create_keypoint_hflip_indices",
"filter_empty_instances",
"read_image",
]
class SizeMismatchError(ValueError):
"""
When the loaded image has a different width/height than the annotation.
"""
# https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601
_M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]]
_M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]]
# https://www.exiv2.org/tags.html
_EXIF_ORIENT = 274 # exif 'Orientation' tag
def convert_PIL_to_numpy(image, format):
"""
Convert PIL image to numpy array of target format.
Args:
image (PIL.Image): a PIL image
format (str): the format of output image
Returns:
(np.ndarray): also see `read_image`
"""
if format is not None:
# PIL only supports RGB, so convert to RGB here and flip the channel order below if needed
conversion_format = format
if format in ["BGR", "YUV-BT.601"]:
conversion_format = "RGB"
image = image.convert(conversion_format)
image = np.asarray(image)
# PIL squeezes out the channel dimension for "L", so make it HWC
if format == "L":
image = np.expand_dims(image, -1)
# handle formats not supported by PIL
elif format == "BGR":
# flip channels if needed
image = image[:, :, ::-1]
elif format == "YUV-BT.601":
image = image / 255.0
image = np.dot(image, np.array(_M_RGB2YUV).T)
return image
def convert_image_to_rgb(image, format):
"""
Convert an image from given format to RGB.
Args:
image (np.ndarray or Tensor): an HWC image
format (str): the format of input image, also see `read_image`
Returns:
(np.ndarray): (H,W,3) RGB image in 0-255 range, can be either float or uint8
"""
if isinstance(image, torch.Tensor):
image = image.cpu().numpy()
if format == "BGR":
image = image[:, :, [2, 1, 0]]
elif format == "YUV-BT.601":
image = np.dot(image, np.array(_M_YUV2RGB).T)
image = image * 255.0
else:
if format == "L":
image = image[:, :, 0]
image = image.astype(np.uint8)
image = np.asarray(Image.fromarray(image, mode=format).convert("RGB"))
return image
def _apply_exif_orientation(image):
"""
Applies the exif orientation correctly.
This code exists because of the bug
https://github.com/python-pillow/Pillow/issues/3973
in the function `ImageOps.exif_transpose`: the Pillow implementation raises errors
with various methods, especially `tobytes`.
Function based on:
https://github.com/wkentaro/labelme/blob/v4.5.4/labelme/utils/image.py#L59
https://github.com/python-pillow/Pillow/blob/7.1.2/src/PIL/ImageOps.py#L527
Args:
image (PIL.Image): a PIL image
Returns:
(PIL.Image): the PIL image with exif orientation applied, if applicable
"""
if not hasattr(image, "getexif"):
return image
try:
exif = image.getexif()
except Exception: # https://github.com/facebookresearch/detectron2/issues/1885
exif = None
if exif is None:
return image
orientation = exif.get(_EXIF_ORIENT)
method = {
2: Image.FLIP_LEFT_RIGHT,
3: Image.ROTATE_180,
4: Image.FLIP_TOP_BOTTOM,
5: Image.TRANSPOSE,
6: Image.ROTATE_270,
7: Image.TRANSVERSE,
8: Image.ROTATE_90,
}.get(orientation)
if method is not None:
return image.transpose(method)
return image
def read_image(file_name, format=None):
"""
Read an image into the given format.
Will apply rotation and flipping if the image has such exif information.
Args:
file_name (str): image file path
format (str): one of the supported image modes in PIL, or "BGR" or "YUV-BT.601".
Returns:
image (np.ndarray):
an HWC image in the given format, which is 0-255, uint8 for
supported image modes in PIL or "BGR"; float (0-1 for Y) for YUV-BT.601.
"""
with PathManager.open(file_name, "rb") as f:
image = Image.open(f)
# work around this bug: https://github.com/python-pillow/Pillow/issues/3973
image = _apply_exif_orientation(image)
return convert_PIL_to_numpy(image, format)
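# Example usage (a minimal sketch; the path is hypothetical). The call applies any EXIF
# orientation and returns an HWC uint8 array with channels in BGR order, matching
# cfg.INPUT.FORMAT == "BGR":
#
#     img = read_image("datasets/my_dataset/images/0001.jpg", format="BGR")
#     assert img.ndim == 3 and img.shape[2] == 3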
def check_image_size(dataset_dict, image):
"""
Raise an error if the image does not match the size specified in the dict.
"""
if "width" in dataset_dict or "height" in dataset_dict:
image_wh = (image.shape[1], image.shape[0])
expected_wh = (dataset_dict["width"], dataset_dict["height"])
if not image_wh == expected_wh:
raise SizeMismatchError(
"Mismatched image shape{}, got {}, expect {}.".format(
" for image " + dataset_dict["file_name"]
if "file_name" in dataset_dict
else "",
image_wh,
expected_wh,
)
+ " Please check the width/height in your annotation."
)
# To ensure bbox always remap to original image size
if "width" not in dataset_dict:
dataset_dict["width"] = image.shape[1]
if "height" not in dataset_dict:
dataset_dict["height"] = image.shape[0]
def transform_proposals(dataset_dict, image_shape, transforms, *, proposal_topk, min_box_size=0):
"""
Apply transformations to the proposals in dataset_dict, if any.
Args:
dataset_dict (dict): a dict read from the dataset, possibly
contains fields "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode"
image_shape (tuple): height, width
transforms (TransformList):
proposal_topk (int): only keep top-K scoring proposals
min_box_size (int): proposals with either side smaller than this
threshold are removed
The input dict is modified in-place, with the above-mentioned keys removed. A new
key "proposals" will be added. Its value is an `Instances`
object which contains the transformed proposals in its field
"proposal_boxes" and "objectness_logits".
"""
if "proposal_boxes" in dataset_dict:
# Transform proposal boxes
boxes = transforms.apply_box(
BoxMode.convert(
dataset_dict.pop("proposal_boxes"),
dataset_dict.pop("proposal_bbox_mode"),
BoxMode.XYXY_ABS,
)
)
boxes = Boxes(boxes)
objectness_logits = torch.as_tensor(
dataset_dict.pop("proposal_objectness_logits").astype("float32")
)
boxes.clip(image_shape)
keep = boxes.nonempty(threshold=min_box_size)
boxes = boxes[keep]
objectness_logits = objectness_logits[keep]
proposals = Instances(image_shape)
proposals.proposal_boxes = boxes[:proposal_topk]
proposals.objectness_logits = objectness_logits[:proposal_topk]
dataset_dict["proposals"] = proposals
def transform_instance_annotations(
annotation, transforms, image_size, *, keypoint_hflip_indices=None
):
"""
Apply transforms to box, segmentation and keypoints annotations of a single instance.
It will use `transforms.apply_box` for the box, and
`transforms.apply_coords` for segmentation polygons & keypoints.
If you need anything more specially designed for each data structure,
you'll need to implement your own version of this function or the transforms.
Args:
annotation (dict): dict of instance annotations for a single instance.
It will be modified in-place.
transforms (TransformList or list[Transform]):
image_size (tuple): the height, width of the transformed image
keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
Returns:
dict:
the same input dict with fields "bbox", "segmentation", "keypoints"
transformed according to `transforms`.
The "bbox_mode" field will be set to XYXY_ABS.
"""
if isinstance(transforms, (tuple, list)):
transforms = T.TransformList(transforms)
# bbox is 1d (per-instance bounding box)
bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
# clip transformed bbox to image size
bbox = transforms.apply_box(np.array([bbox]))[0].clip(min=0)
annotation["bbox"] = np.minimum(bbox, list(image_size + image_size)[::-1])
annotation["bbox_mode"] = BoxMode.XYXY_ABS
if "segmentation" in annotation:
# each instance contains 1 or more polygons
segm = annotation["segmentation"]
if isinstance(segm, list):
# polygons
polygons = [np.asarray(p).reshape(-1, 2) for p in segm]
annotation["segmentation"] = [
p.reshape(-1) for p in transforms.apply_polygons(polygons)
]
elif isinstance(segm, dict):
# RLE
mask = mask_util.decode(segm)
mask = transforms.apply_segmentation(mask)
assert tuple(mask.shape[:2]) == image_size
annotation["segmentation"] = mask
else:
raise ValueError(
"Cannot transform segmentation of type '{}'!"
"Supported types are: polygons as list[list[float] or ndarray],"
" COCO-style RLE as a dict.".format(type(segm))
)
if "keypoints" in annotation:
keypoints = transform_keypoint_annotations(
annotation["keypoints"], transforms, image_size, keypoint_hflip_indices
)
annotation["keypoints"] = keypoints
return annotation
def transform_keypoint_annotations(keypoints, transforms, image_size, keypoint_hflip_indices=None):
"""
Transform keypoint annotations of an image.
If a keypoint is transformed out of image boundary, it will be marked "unlabeled" (visibility=0)
Args:
keypoints (list[float]): Nx3 float in Detectron2's Dataset format.
Each point is represented by (x, y, visibility).
transforms (TransformList):
image_size (tuple): the height, width of the transformed image
keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
When `transforms` includes horizontal flip, will use the index
mapping to flip keypoints.
"""
# (N*3,) -> (N, 3)
keypoints = np.asarray(keypoints, dtype="float64").reshape(-1, 3)
keypoints_xy = transforms.apply_coords(keypoints[:, :2])
# Set all out-of-boundary points to "unlabeled"
inside = (keypoints_xy >= np.array([0, 0])) & (keypoints_xy <= np.array(image_size[::-1]))
inside = inside.all(axis=1)
keypoints[:, :2] = keypoints_xy
keypoints[:, 2][~inside] = 0
# This assumes that HorizFlipTransform is the only one that does flip
do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
# Alternative way: check if probe points were horizontally flipped.
# probe = np.asarray([[0.0, 0.0], [image_width, 0.0]])
# probe_aug = transforms.apply_coords(probe.copy())
# do_hflip = np.sign(probe[1][0] - probe[0][0]) != np.sign(probe_aug[1][0] - probe_aug[0][0]) # noqa
# If flipped, swap each keypoint with its opposite-handed equivalent
if do_hflip:
if keypoint_hflip_indices is None:
raise ValueError("Cannot flip keypoints without providing flip indices!")
if len(keypoints) != len(keypoint_hflip_indices):
raise ValueError(
"Keypoint data has {} points, but metadata "
"contains {} points!".format(len(keypoints), len(keypoint_hflip_indices))
)
keypoints = keypoints[np.asarray(keypoint_hflip_indices, dtype=np.int32), :]
# Maintain COCO convention that if visibility == 0 (unlabeled), then x, y = 0
keypoints[keypoints[:, 2] == 0] = 0
return keypoints
def annotations_to_instances(annos, image_size, mask_format="polygon"):
"""
Create an :class:`Instances` object used by the models,
from instance annotations in the dataset dict.
Args:
annos (list[dict]): a list of instance annotations in one image, each
element for one instance.
image_size (tuple): height, width
Returns:
Instances:
It will contain fields "gt_boxes", "gt_classes",
"gt_masks", "gt_keypoints", if they can be obtained from `annos`.
This is the format that builtin models expect.
"""
boxes = (
np.stack(
[BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
)
if len(annos)
else np.zeros((0, 4))
)
target = Instances(image_size)
target.gt_boxes = Boxes(boxes)
classes = [int(obj["category_id"]) for obj in annos]
classes = torch.tensor(classes, dtype=torch.int64)
target.gt_classes = classes
if len(annos) and "segmentation" in annos[0]:
segms = [obj["segmentation"] for obj in annos]
if mask_format == "polygon":
try:
masks = PolygonMasks(segms)
except ValueError as e:
raise ValueError(
"Failed to use mask_format=='polygon' from the given annotations!"
) from e
else:
assert mask_format == "bitmask", mask_format
masks = []
for segm in segms:
if isinstance(segm, list):
# polygon
masks.append(polygons_to_bitmask(segm, *image_size))
elif isinstance(segm, dict):
# COCO RLE
masks.append(mask_util.decode(segm))
elif isinstance(segm, np.ndarray):
assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
segm.ndim
)
# mask array
masks.append(segm)
else:
raise ValueError(
"Cannot convert segmentation of type '{}' to BitMasks!"
"Supported types are: polygons as list[list[float] or ndarray],"
" COCO-style RLE as a dict, or a binary segmentation mask "
" in a 2D numpy array of shape HxW.".format(type(segm))
)
# torch.from_numpy does not support array with negative stride.
masks = BitMasks(
torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks])
)
target.gt_masks = masks
if len(annos) and "keypoints" in annos[0]:
kpts = [obj.get("keypoints", []) for obj in annos]
target.gt_keypoints = Keypoints(kpts)
return target
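# Minimal sketch (hypothetical annotation) of building the Instances object that builtin
# models consume:
#
#     annos = [{"bbox": [10.0, 20.0, 50.0, 80.0], "bbox_mode": BoxMode.XYWH_ABS, "category_id": 3}]
#     instances = annotations_to_instances(annos, image_size=(480, 640))
#     # instances.gt_boxes holds the box converted to XYXY_ABS; instances.gt_classes == tensor([3])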
def annotations_to_instances_rotated(annos, image_size):
"""
Create an :class:`Instances` object used by the models,
from instance annotations in the dataset dict.
Compared to `annotations_to_instances`, this function is for rotated boxes only
Args:
annos (list[dict]): a list of instance annotations in one image, each
element for one instance.
image_size (tuple): height, width
Returns:
Instances:
Containing fields "gt_boxes", "gt_classes",
if they can be obtained from `annos`.
This is the format that builtin models expect.
"""
boxes = [obj["bbox"] for obj in annos]
target = Instances(image_size)
boxes = target.gt_boxes = RotatedBoxes(boxes)
boxes.clip(image_size)
classes = [obj["category_id"] for obj in annos]
classes = torch.tensor(classes, dtype=torch.int64)
target.gt_classes = classes
return target
def filter_empty_instances(
instances, by_box=True, by_mask=True, box_threshold=1e-5, return_mask=False
):
"""
Filter out empty instances in an `Instances` object.
Args:
instances (Instances):
by_box (bool): whether to filter out instances with empty boxes
by_mask (bool): whether to filter out instances with empty masks
box_threshold (float): minimum width and height to be considered non-empty
return_mask (bool): whether to return boolean mask of filtered instances
Returns:
Instances: the filtered instances.
tensor[bool], optional: boolean mask of filtered instances
"""
assert by_box or by_mask
r = []
if by_box:
r.append(instances.gt_boxes.nonempty(threshold=box_threshold))
if instances.has("gt_masks") and by_mask:
r.append(instances.gt_masks.nonempty())
# TODO: can also filter visible keypoints
if not r:
return instances
m = r[0]
for x in r[1:]:
m = m & x
if return_mask:
return instances[m], m
return instances[m]
def create_keypoint_hflip_indices(dataset_names: Union[str, List[str]]) -> List[int]:
"""
Args:
dataset_names: list of dataset names
Returns:
list[int]: a list of size=#keypoints, storing the
horizontally-flipped keypoint indices.
"""
if isinstance(dataset_names, str):
dataset_names = [dataset_names]
check_metadata_consistency("keypoint_names", dataset_names)
check_metadata_consistency("keypoint_flip_map", dataset_names)
meta = MetadataCatalog.get(dataset_names[0])
names = meta.keypoint_names
# TODO flip -> hflip
flip_map = dict(meta.keypoint_flip_map)
flip_map.update({v: k for k, v in flip_map.items()})
flipped_names = [i if i not in flip_map else flip_map[i] for i in names]
flip_indices = [names.index(i) for i in flipped_names]
return flip_indices
def gen_crop_transform_with_instance(crop_size, image_size, instance):
"""
Generate a CropTransform so that the cropping region contains
the center of the given instance.
Args:
crop_size (tuple): h, w in pixels
image_size (tuple): h, w
instance (dict): an annotation dict of one instance, in Detectron2's
dataset format.
"""
crop_size = np.asarray(crop_size, dtype=np.int32)
bbox = BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS)
center_yx = (bbox[1] + bbox[3]) * 0.5, (bbox[0] + bbox[2]) * 0.5
assert (
image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1]
), "The annotation bounding box is outside of the image!"
assert (
image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1]
), "Crop size is larger than image size!"
min_yx = np.maximum(np.floor(center_yx).astype(np.int32) - crop_size, 0)
max_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0)
max_yx = np.minimum(max_yx, np.ceil(center_yx).astype(np.int32))
y0 = np.random.randint(min_yx[0], max_yx[0] + 1)
x0 = np.random.randint(min_yx[1], max_yx[1] + 1)
return T.CropTransform(x0, y0, crop_size[1], crop_size[0])
def check_metadata_consistency(key, dataset_names):
"""
Check that the datasets have consistent metadata.
Args:
key (str): a metadata key
dataset_names (list[str]): a list of dataset names
Raises:
AttributeError: if the key does not exist in the metadata
ValueError: if the given datasets do not have the same metadata values defined by key
"""
if len(dataset_names) == 0:
return
logger = logging.getLogger(__name__)
entries_per_dataset = [getattr(MetadataCatalog.get(d), key) for d in dataset_names]
for idx, entry in enumerate(entries_per_dataset):
if entry != entries_per_dataset[0]:
logger.error(
"Metadata '{}' for dataset '{}' is '{}'".format(key, dataset_names[idx], str(entry))
)
logger.error(
"Metadata '{}' for dataset '{}' is '{}'".format(
key, dataset_names[0], str(entries_per_dataset[0])
)
)
raise ValueError("Datasets have different metadata '{}'!".format(key))
def build_augmentation(cfg, is_train):
"""
Create a list of default :class:`Augmentation` from config.
Now it includes resizing and flipping.
Returns:
list[Augmentation]
"""
if is_train:
min_size = cfg.INPUT.MIN_SIZE_TRAIN
max_size = cfg.INPUT.MAX_SIZE_TRAIN
sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
else:
min_size = cfg.INPUT.MIN_SIZE_TEST
max_size = cfg.INPUT.MAX_SIZE_TEST
sample_style = "choice"
augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)]
if is_train and cfg.INPUT.RANDOM_FLIP != "none":
augmentation.append(
T.RandomFlip(
horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
)
)
return augmentation
build_transform_gen = build_augmentation
"""
Alias for backward-compatibility.
"""
# Copyright (c) Facebook, Inc. and its affiliates.
from .distributed_sampler import (
InferenceSampler,
RandomSubsetTrainingSampler,
RepeatFactorTrainingSampler,
TrainingSampler,
)
from .grouped_batch_sampler import GroupedBatchSampler
__all__ = [
"GroupedBatchSampler",
"TrainingSampler",
"RandomSubsetTrainingSampler",
"InferenceSampler",
"RepeatFactorTrainingSampler",
]
# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
import logging
import math
from collections import defaultdict
from typing import Optional
import torch
from torch.utils.data.sampler import Sampler
from detectron2.utils import comm
logger = logging.getLogger(__name__)
class TrainingSampler(Sampler):
"""
In training, we only care about the "infinite stream" of training data.
So this sampler produces an infinite stream of indices and
all workers cooperate to correctly shuffle the indices and sample different indices.
The samplers in each worker effectively produce `indices[worker_id::num_workers]`
where `indices` is an infinite stream of indices consisting of
`shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True)
or `range(size) + range(size) + ...` (if shuffle is False)
Note that this sampler does not shard based on pytorch DataLoader worker id.
A sampler passed to pytorch DataLoader is used only with map-style dataset
and will not be executed inside workers.
But if this sampler is used in a way that it gets executed inside a dataloader
worker, then extra work needs to be done to shard its outputs based on worker id.
This is required so that workers don't produce identical data.
:class:`ToIterableDataset` implements this logic.
This note is true for all samplers in detectron2.
"""
def __init__(self, size: int, shuffle: bool = True, seed: Optional[int] = None):
"""
Args:
size (int): the total number of data of the underlying dataset to sample from
shuffle (bool): whether to shuffle the indices or not
seed (int): the initial seed of the shuffle. Must be the same
across all workers. If None, will use a random seed shared
among workers (require synchronization among all workers).
"""
if not isinstance(size, int):
raise TypeError(f"TrainingSampler(size=) expects an int. Got type {type(size)}.")
if size <= 0:
raise ValueError(f"TrainingSampler(size=) expects a positive int. Got {size}.")
self._size = size
self._shuffle = shuffle
if seed is None:
seed = comm.shared_random_seed()
self._seed = int(seed)
self._rank = comm.get_rank()
self._world_size = comm.get_world_size()
def __iter__(self):
start = self._rank
yield from itertools.islice(self._infinite_indices(), start, None, self._world_size)
def _infinite_indices(self):
g = torch.Generator()
g.manual_seed(self._seed)
while True:
if self._shuffle:
yield from torch.randperm(self._size, generator=g).tolist()
else:
yield from torch.arange(self._size).tolist()
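# Example of the sharding described in the class docstring (sizes are hypothetical): with
# world_size=2, rank 0 consumes indices[0::2] and rank 1 consumes indices[1::2] of the same
# shuffled infinite stream, so together the ranks cover every index once per pass:
#
#     sampler = TrainingSampler(size=100, shuffle=True, seed=42)
#     it = iter(sampler)
#     first_indices = [next(it) for _ in range(8)]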
class RandomSubsetTrainingSampler(TrainingSampler):
"""
    Similar to TrainingSampler, but only samples a random subset of indices.
This is useful when you want to estimate the accuracy vs data-number curves by
training the model with different subset_ratio.
"""
def __init__(
self,
size: int,
subset_ratio: float,
shuffle: bool = True,
seed_shuffle: Optional[int] = None,
seed_subset: Optional[int] = None,
):
"""
Args:
size (int): the total number of data of the underlying dataset to sample from
subset_ratio (float): the ratio of subset data to sample from the underlying dataset
shuffle (bool): whether to shuffle the indices or not
seed_shuffle (int): the initial seed of the shuffle. Must be the same
across all workers. If None, will use a random seed shared
among workers (require synchronization among all workers).
seed_subset (int): the seed to randomize the subset to be sampled.
Must be the same across all workers. If None, will use a random seed shared
among workers (require synchronization among all workers).
"""
super().__init__(size=size, shuffle=shuffle, seed=seed_shuffle)
assert 0.0 < subset_ratio <= 1.0
self._size_subset = int(size * subset_ratio)
assert self._size_subset > 0
if seed_subset is None:
seed_subset = comm.shared_random_seed()
self._seed_subset = int(seed_subset)
# randomly generate the subset indexes to be sampled from
g = torch.Generator()
g.manual_seed(self._seed_subset)
indexes_randperm = torch.randperm(self._size, generator=g)
self._indexes_subset = indexes_randperm[: self._size_subset]
logger.info("Using RandomSubsetTrainingSampler......")
logger.info(f"Randomly sample {self._size_subset} data from the original {self._size} data")
def _infinite_indices(self):
g = torch.Generator()
g.manual_seed(self._seed) # self._seed equals seed_shuffle from __init__()
while True:
if self._shuffle:
# generate a random permutation to shuffle self._indexes_subset
randperm = torch.randperm(self._size_subset, generator=g)
yield from self._indexes_subset[randperm].tolist()
else:
yield from self._indexes_subset.tolist()
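# Illustrative sketch (not part of this module): draw indefinitely from a fixed random
# 25% subset of a 100-element dataset. The seeds are arbitrary; only the subset chosen
# by seed_subset ever appears in the stream.
def _example_random_subset_sampler():
    sampler = RandomSubsetTrainingSampler(
        size=100, subset_ratio=0.25, seed_shuffle=0, seed_subset=1
    )
    # The first 25 indices are exactly one shuffled pass over the chosen subset.
    return sorted(itertools.islice(iter(sampler), 25))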
class RepeatFactorTrainingSampler(Sampler):
"""
Similar to TrainingSampler, but a sample may appear more times than others based
on its "repeat factor". This is suitable for training on class imbalanced datasets like LVIS.
"""
def __init__(self, repeat_factors, *, shuffle=True, seed=None):
"""
Args:
            repeat_factors (Tensor): a float vector, the repeat factor for each index. When it's
full of ones, it is equivalent to ``TrainingSampler(len(repeat_factors), ...)``.
shuffle (bool): whether to shuffle the indices or not
seed (int): the initial seed of the shuffle. Must be the same
across all workers. If None, will use a random seed shared
among workers (require synchronization among all workers).
"""
self._shuffle = shuffle
if seed is None:
seed = comm.shared_random_seed()
self._seed = int(seed)
self._rank = comm.get_rank()
self._world_size = comm.get_world_size()
# Split into whole number (_int_part) and fractional (_frac_part) parts.
self._int_part = torch.trunc(repeat_factors)
self._frac_part = repeat_factors - self._int_part
@staticmethod
def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh):
"""
Compute (fractional) per-image repeat factors based on category frequency.
The repeat factor for an image is a function of the frequency of the rarest
category labeled in that image. The "frequency of category c" in [0, 1] is defined
as the fraction of images in the training set (without repeats) in which category c
appears.
See :paper:`lvis` (>= v2) Appendix B.2.
Args:
dataset_dicts (list[dict]): annotations in Detectron2 dataset format.
            repeat_thresh (float): frequency threshold below which data is repeated.
                With the ``sqrt`` formula below, an image whose rarest category has
                frequency ``repeat_thresh / 2`` gets a repeat factor of ``sqrt(2)`` (~1.41).
Returns:
torch.Tensor:
the i-th element is the repeat factor for the dataset image at index i.
"""
# 1. For each category c, compute the fraction of images that contain it: f(c)
category_freq = defaultdict(int)
for dataset_dict in dataset_dicts: # For each image (without repeats)
cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
for cat_id in cat_ids:
category_freq[cat_id] += 1
num_images = len(dataset_dicts)
for k, v in category_freq.items():
category_freq[k] = v / num_images
# 2. For each category c, compute the category-level repeat factor:
# r(c) = max(1, sqrt(t / f(c)))
category_rep = {
cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq))
for cat_id, cat_freq in category_freq.items()
}
# 3. For each image I, compute the image-level repeat factor:
# r(I) = max_{c in I} r(c)
rep_factors = []
for dataset_dict in dataset_dicts:
cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0)
rep_factors.append(rep_factor)
return torch.tensor(rep_factors, dtype=torch.float32)
def _get_epoch_indices(self, generator):
"""
Create a list of dataset indices (with repeats) to use for one epoch.
Args:
generator (torch.Generator): pseudo random number generator used for
stochastic rounding.
Returns:
torch.Tensor: list of dataset indices to use in one epoch. Each index
is repeated based on its calculated repeat factor.
"""
# Since repeat factors are fractional, we use stochastic rounding so
# that the target repeat factor is achieved in expectation over the
# course of training
rands = torch.rand(len(self._frac_part), generator=generator)
rep_factors = self._int_part + (rands < self._frac_part).float()
# Construct a list of indices in which we repeat images as specified
indices = []
for dataset_index, rep_factor in enumerate(rep_factors):
indices.extend([dataset_index] * int(rep_factor.item()))
return torch.tensor(indices, dtype=torch.int64)
def __iter__(self):
start = self._rank
yield from itertools.islice(self._infinite_indices(), start, None, self._world_size)
def _infinite_indices(self):
g = torch.Generator()
g.manual_seed(self._seed)
while True:
# Sample indices with repeats determined by stochastic rounding; each
# "epoch" may have a slightly different size due to the rounding.
indices = self._get_epoch_indices(g)
if self._shuffle:
randperm = torch.randperm(len(indices), generator=g)
yield from indices[randperm].tolist()
else:
yield from indices.tolist()
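# Illustrative sketch (not part of this module): the toy dataset below is an assumption
# for demonstration. Category 1 appears in all 4 images (frequency 1.0), so those images
# keep repeat factor 1; category 2 appears in 1 of 4 images (frequency 0.25), so with
# repeat_thresh=0.5 that image gets max(1, sqrt(0.5 / 0.25)) ~= 1.41.
def _example_repeat_factor_sampler():
    toy_dicts = [
        {"annotations": [{"category_id": 1}]},
        {"annotations": [{"category_id": 1}]},
        {"annotations": [{"category_id": 1}]},
        {"annotations": [{"category_id": 1}, {"category_id": 2}]},
    ]
    rep_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
        toy_dicts, repeat_thresh=0.5
    )
    # rep_factors == tensor([1.0000, 1.0000, 1.0000, 1.4142])
    sampler = RepeatFactorTrainingSampler(rep_factors, seed=0)
    # Over a long stream, index 3 appears ~1.41x as often as the others.
    return list(itertools.islice(iter(sampler), 20))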
class InferenceSampler(Sampler):
"""
Produce indices for inference across all workers.
Inference needs to run on the __exact__ set of samples,
therefore when the total number of samples is not divisible by the number of workers,
    this sampler produces a different number of samples on different workers.
"""
def __init__(self, size: int):
"""
Args:
size (int): the total number of data of the underlying dataset to sample from
"""
self._size = size
assert size > 0
self._rank = comm.get_rank()
self._world_size = comm.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[: rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
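# Illustrative sketch (not part of this module): how 10 samples are sharded across
# 3 workers by the (private) helper above. 10 % 3 == 1, so the first shard gets one
# extra sample: range(0, 4), range(4, 7), range(7, 10).
def _example_inference_sharding():
    shards = [InferenceSampler._get_local_indices(10, 3, rank) for rank in range(3)]
    assert [len(s) for s in shards] == [4, 3, 3]
    return shards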
# Copyright (c) Facebook, Inc. and its affiliates.
import numpy as np
from torch.utils.data.sampler import BatchSampler, Sampler
class GroupedBatchSampler(BatchSampler):
"""
Wraps another sampler to yield a mini-batch of indices.
    It enforces that each batch only contains elements from the same group.
    It also tries to provide mini-batches that follow an ordering as close
    as possible to the ordering of the original sampler.
"""
def __init__(self, sampler, group_ids, batch_size):
"""
Args:
sampler (Sampler): Base sampler.
group_ids (list[int]): If the sampler produces indices in range [0, N),
`group_ids` must be a list of `N` ints which contains the group id of each sample.
The group ids must be a set of integers in the range [0, num_groups).
batch_size (int): Size of mini-batch.
"""
if not isinstance(sampler, Sampler):
raise ValueError(
"sampler should be an instance of "
"torch.utils.data.Sampler, but got sampler={}".format(sampler)
)
self.sampler = sampler
self.group_ids = np.asarray(group_ids)
assert self.group_ids.ndim == 1
self.batch_size = batch_size
groups = np.unique(self.group_ids).tolist()
# buffer the indices of each group until batch size is reached
self.buffer_per_group = {k: [] for k in groups}
def __iter__(self):
for idx in self.sampler:
group_id = self.group_ids[idx]
group_buffer = self.buffer_per_group[group_id]
group_buffer.append(idx)
if len(group_buffer) == self.batch_size:
yield group_buffer[:] # yield a copy of the list
del group_buffer[:]
def __len__(self):
raise NotImplementedError("len() of GroupedBatchSampler is not well-defined.")
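# Illustrative sketch (not part of this module): batch 6 samples so that a batch never
# mixes group ids (e.g. 0 = landscape, 1 = portrait aspect ratio). Incomplete trailing
# buffers are silently dropped.
def _example_grouped_batches():
    from torch.utils.data.sampler import SequentialSampler
    group_ids = [0, 1, 0, 0, 1, 1]
    batch_sampler = GroupedBatchSampler(SequentialSampler(range(6)), group_ids, batch_size=2)
    # Yields [0, 2] (group 0) then [1, 4] (group 1); indices 3 and 5 remain in their
    # buffers and are dropped at the end of the epoch.
    return list(batch_sampler)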
# Copyright (c) Facebook, Inc. and its affiliates.
from fvcore.transforms.transform import Transform, TransformList # order them first
from fvcore.transforms.transform import *
from .transform import *
from .augmentation import *
from .augmentation_impl import *
__all__ = [k for k in globals().keys() if not k.startswith("_")]
from detectron2.utils.env import fixup_module_metadata
fixup_module_metadata(__name__, globals(), __all__)
del fixup_module_metadata
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
import inspect
import numpy as np
import pprint
from typing import Any, List, Optional, Tuple, Union
from fvcore.transforms.transform import Transform, TransformList
"""
See "Data Augmentation" tutorial for an overview of the system:
https://detectron2.readthedocs.io/tutorials/augmentation.html
"""
__all__ = [
"Augmentation",
"AugmentationList",
"AugInput",
"TransformGen",
"apply_transform_gens",
"StandardAugInput",
"apply_augmentations",
]
def _check_img_dtype(img):
    assert isinstance(img, np.ndarray), "[Augmentation] Needs a numpy array, but got a {}!".format(
type(img)
)
    # np.issubdtype is required here; isinstance(img.dtype, np.integer) is always False
    assert not np.issubdtype(img.dtype, np.integer) or (
img.dtype == np.uint8
), "[Augmentation] Got image of type {}, use uint8 or floating points instead!".format(
img.dtype
)
assert img.ndim in [2, 3], img.ndim
def _get_aug_input_args(aug, aug_input) -> List[Any]:
"""
Get the arguments to be passed to ``aug.get_transform`` from the input ``aug_input``.
"""
if aug.input_args is None:
# Decide what attributes are needed automatically
prms = list(inspect.signature(aug.get_transform).parameters.items())
        # The default behavior is: if there is one parameter, then it's "image"
        # (this works automatically for the majority of use cases, and also avoids breaking BC).
# Otherwise, use the argument names.
if len(prms) == 1:
names = ("image",)
else:
names = []
for name, prm in prms:
if prm.kind in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD):
raise TypeError(
f""" \
The default implementation of `{type(aug)}.__call__` does not allow \
`{type(aug)}.get_transform` to use variable-length arguments (*args, **kwargs)! \
If arguments are unknown, reimplement `__call__` instead. \
"""
)
names.append(name)
aug.input_args = tuple(names)
args = []
for f in aug.input_args:
try:
args.append(getattr(aug_input, f))
except AttributeError as e:
raise AttributeError(
f"{type(aug)}.get_transform needs input attribute '{f}', "
f"but it is not an attribute of {type(aug_input)}!"
) from e
return args
class Augmentation:
"""
Augmentation defines (often random) policies/strategies to generate :class:`Transform`
from data. It is often used for pre-processing of input data.
A "policy" that generates a :class:`Transform` may, in the most general case,
need arbitrary information from input data in order to determine what transforms
to apply. Therefore, each :class:`Augmentation` instance defines the arguments
needed by its :meth:`get_transform` method. When called with the positional arguments,
the :meth:`get_transform` method executes the policy.
Note that :class:`Augmentation` defines the policies to create a :class:`Transform`,
but not how to execute the actual transform operations to those data.
Its :meth:`__call__` method will use :meth:`AugInput.transform` to execute the transform.
    The returned `Transform` object is meant to describe a deterministic transformation, which means
it can be re-applied on associated data, e.g. the geometry of an image and its segmentation
masks need to be transformed together.
(If such re-application is not needed, then determinism is not a crucial requirement.)
"""
input_args: Optional[Tuple[str]] = None
"""
Stores the attribute names needed by :meth:`get_transform`, e.g. ``("image", "sem_seg")``.
By default, it is just a tuple of argument names in :meth:`self.get_transform`, which often only
contain "image". As long as the argument name convention is followed, there is no need for
users to touch this attribute.
"""
def _init(self, params=None):
if params:
for k, v in params.items():
if k != "self" and not k.startswith("_"):
setattr(self, k, v)
def get_transform(self, *args) -> Transform:
"""
Execute the policy based on input data, and decide what transform to apply to inputs.
Args:
args: Any fixed-length positional arguments. By default, the name of the arguments
should exist in the :class:`AugInput` to be used.
Returns:
Transform: Returns the deterministic transform to apply to the input.
Examples:
::
            class MyAug(Augmentation):
                # if a policy needs to know both image and semantic segmentation
                def get_transform(self, image, sem_seg) -> T.Transform:
pass
tfm: Transform = MyAug().get_transform(image, sem_seg)
new_image = tfm.apply_image(image)
Notes:
Users can freely use arbitrary new argument names in custom
:meth:`get_transform` method, as long as they are available in the
input data. In detectron2 we use the following convention:
* image: (H,W) or (H,W,C) ndarray of type uint8 in range [0, 255], or
floating point in range [0, 1] or [0, 255].
* boxes: (N,4) ndarray of float32. It represents the instance bounding boxes
of N instances. Each is in XYXY format in unit of absolute coordinates.
* sem_seg: (H,W) ndarray of type uint8. Each element is an integer label of pixel.
We do not specify convention for other types and do not include builtin
:class:`Augmentation` that uses other types in detectron2.
"""
raise NotImplementedError
def __call__(self, aug_input) -> Transform:
"""
Augment the given `aug_input` **in-place**, and return the transform that's used.
        This method will be called to apply the augmentation. In most augmentations, it
is enough to use the default implementation, which calls :meth:`get_transform`
using the inputs. But a subclass can overwrite it to have more complicated logic.
Args:
aug_input (AugInput): an object that has attributes needed by this augmentation
(defined by ``self.get_transform``). Its ``transform`` method will be called
to in-place transform it.
Returns:
Transform: the transform that is applied on the input.
"""
args = _get_aug_input_args(self, aug_input)
tfm = self.get_transform(*args)
assert isinstance(tfm, (Transform, TransformList)), (
f"{type(self)}.get_transform must return an instance of Transform! "
f"Got {type(tfm)} instead."
)
aug_input.transform(tfm)
return tfm
def _rand_range(self, low=1.0, high=None, size=None):
"""
Uniform float random number between low and high.
"""
if high is None:
low, high = 0, low
if size is None:
size = []
return np.random.uniform(low, high, size)
def __repr__(self):
"""
Produce something like:
"MyAugmentation(field1={self.field1}, field2={self.field2})"
"""
try:
sig = inspect.signature(self.__init__)
classname = type(self).__name__
argstr = []
for name, param in sig.parameters.items():
assert (
param.kind != param.VAR_POSITIONAL and param.kind != param.VAR_KEYWORD
), "The default __repr__ doesn't support *args or **kwargs"
assert hasattr(self, name), (
"Attribute {} not found! "
"Default __repr__ only works if attributes match the constructor.".format(name)
)
attr = getattr(self, name)
default = param.default
if default is attr:
continue
attr_str = pprint.pformat(attr)
if "\n" in attr_str:
# don't show it if pformat decides to use >1 lines
attr_str = "..."
argstr.append("{}={}".format(name, attr_str))
return "{}({})".format(classname, ", ".join(argstr))
except AssertionError:
return super().__repr__()
__str__ = __repr__
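# Illustrative sketch (not part of this module): a hypothetical subclass showing the
# minimal contract of Augmentation -- implement get_transform() and return a
# deterministic Transform. Real code would simply use the builtin RandomFlip; the
# class name and probability below are assumptions for demonstration.
class _ExampleRandomHFlip(Augmentation):
    def __init__(self, prob: float = 0.5):
        super().__init__()
        self._init(locals())  # stores self.prob so the default __repr__ works
    def get_transform(self, image) -> Transform:
        from fvcore.transforms.transform import HFlipTransform, NoOpTransform
        width = image.shape[1]
        # The randomness lives here; the returned Transform itself is deterministic
        # and can be re-applied to boxes, masks, etc.
        if self._rand_range() < self.prob:
            return HFlipTransform(width)
        return NoOpTransform()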
def _transform_to_aug(tfm_or_aug):
"""
Wrap Transform into Augmentation.
Private, used internally to implement augmentations.
"""
assert isinstance(tfm_or_aug, (Transform, Augmentation)), tfm_or_aug
if isinstance(tfm_or_aug, Augmentation):
return tfm_or_aug
else:
class _TransformToAug(Augmentation):
def __init__(self, tfm: Transform):
self.tfm = tfm
def get_transform(self, *args):
return self.tfm
def __repr__(self):
return repr(self.tfm)
__str__ = __repr__
return _TransformToAug(tfm_or_aug)
class AugmentationList(Augmentation):
"""
Apply a sequence of augmentations.
It has ``__call__`` method to apply the augmentations.
    Note that calling :meth:`get_transform` on an :class:`AugmentationList` is not supported
    (it will raise an error), because in order to apply a sequence of augmentations,
    the k-th augmentation must be applied first, to provide the inputs needed by the
    (k+1)-th augmentation.
"""
def __init__(self, augs):
"""
Args:
augs (list[Augmentation or Transform]):
"""
super().__init__()
self.augs = [_transform_to_aug(x) for x in augs]
def __call__(self, aug_input) -> Transform:
tfms = []
for x in self.augs:
tfm = x(aug_input)
tfms.append(tfm)
return TransformList(tfms)
def __repr__(self):
msgs = [str(x) for x in self.augs]
return "AugmentationList[{}]".format(", ".join(msgs))
__str__ = __repr__
class AugInput:
"""
Input that can be used with :meth:`Augmentation.__call__`.
This is a standard implementation for the majority of use cases.
This class provides the standard attributes **"image", "boxes", "sem_seg"**
defined in :meth:`__init__` and they may be needed by different augmentations.
Most augmentation policies do not need attributes beyond these three.
After applying augmentations to these attributes (using :meth:`AugInput.transform`),
the returned transforms can then be used to transform other data structures that users have.
Examples:
::
input = AugInput(image, boxes=boxes)
tfms = augmentation(input)
transformed_image = input.image
transformed_boxes = input.boxes
transformed_other_data = tfms.apply_other(other_data)
An extended project that works with new data types may implement augmentation policies
that need other inputs. An algorithm may need to transform inputs in a way different
    from the standard approach defined in this class. In those rare situations, users can
    implement a class similar to this one that satisfies the following conditions:
* The input must provide access to these data in the form of attribute access
(``getattr``). For example, if an :class:`Augmentation` to be applied needs "image"
and "sem_seg" arguments, its input must have the attribute "image" and "sem_seg".
* The input must have a ``transform(tfm: Transform) -> None`` method which
in-place transforms all its attributes.
"""
# TODO maybe should support more builtin data types here
def __init__(
self,
image: np.ndarray,
*,
boxes: Optional[np.ndarray] = None,
sem_seg: Optional[np.ndarray] = None,
):
"""
Args:
image (ndarray): (H,W) or (H,W,C) ndarray of type uint8 in range [0, 255], or
floating point in range [0, 1] or [0, 255]. The meaning of C is up
to users.
boxes (ndarray or None): Nx4 float32 boxes in XYXY_ABS mode
sem_seg (ndarray or None): HxW uint8 semantic segmentation mask. Each element
is an integer label of pixel.
"""
_check_img_dtype(image)
self.image = image
self.boxes = boxes
self.sem_seg = sem_seg
def transform(self, tfm: Transform) -> None:
"""
In-place transform all attributes of this class.
By "in-place", it means after calling this method, accessing an attribute such
as ``self.image`` will return transformed data.
"""
self.image = tfm.apply_image(self.image)
if self.boxes is not None:
self.boxes = tfm.apply_box(self.boxes)
if self.sem_seg is not None:
self.sem_seg = tfm.apply_segmentation(self.sem_seg)
def apply_augmentations(
self, augmentations: List[Union[Augmentation, Transform]]
) -> TransformList:
"""
Equivalent of ``AugmentationList(augmentations)(self)``
"""
return AugmentationList(augmentations)(self)
def apply_augmentations(augmentations: List[Union[Transform, Augmentation]], inputs):
"""
Use ``T.AugmentationList(augmentations)(inputs)`` instead.
"""
if isinstance(inputs, np.ndarray):
# handle the common case of image-only Augmentation, also for backward compatibility
image_only = True
inputs = AugInput(inputs)
else:
image_only = False
tfms = inputs.apply_augmentations(augmentations)
return inputs.image if image_only else inputs, tfms
apply_transform_gens = apply_augmentations
"""
Alias for backward-compatibility.
"""
TransformGen = Augmentation
"""
Alias for Augmentation, since it is something that generates :class:`Transform`s
"""
StandardAugInput = AugInput
"""
Alias for compatibility. It's not worth the complexity to have two classes.
"""