Merge branch 'add_v0.6' into 'v0.6-release'

support v0.6 See merge request !3

Merge branch 'add_v0.6' into 'v0.6-release'
support v0.6 See merge request !3
63e10e00 · limm · 2ca8e9fd · b634945d · 63e10e00 · 63e10e00
Commit 63e10e00 authored Apr 09, 2025 by limm
20 changed files
--- a/datasets/prepare_panoptic_fpn.py
+++ b/datasets/prepare_panoptic_fpn.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+import functools
+import json
+import multiprocessing as mp
+import numpy as np
+import os
+import time
+from fvcore.common.download import download
+from panopticapi.utils import rgb2id
+from PIL import Image
+from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
+def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map):
+    """
+    Convert a single panoptic annotation image into a semantic annotation image.
+    Args:
+        input_panoptic (str): path of the panoptic annotation image to read.
+        output_semantic (str): path the resulting semantic image is saved to.
+        segments (list[dict]): segment records of this image; each one is read for
+            its "id" and "category_id".
+        id_map (dict): category id -> value written to the output image.
+    """
+    segment_ids = rgb2id(np.asarray(Image.open(input_panoptic), dtype=np.uint32))
+    # Every pixel starts as 255; only pixels covered by a listed segment are overwritten.
+    semantic = np.full_like(segment_ids, 255, dtype=np.uint8)
+    for segment in segments:
+        semantic[segment_ids == segment["id"]] = id_map[segment["category_id"]]
+    Image.fromarray(semantic).save(output_semantic)
+def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories):
+    """
+    Create semantic segmentation annotations from panoptic segmentation
+    annotations, to be used by PanopticFPN.
+    It maps all thing categories to class 0, and maps all unlabeled pixels to class 255.
+    It maps all stuff categories to contiguous ids starting from 1.
+    Args:
+        panoptic_json (str): path to the panoptic json file, in COCO's format.
+        panoptic_root (str): a directory with panoptic annotation files, in COCO's format.
+        sem_seg_root (str): a directory to output semantic annotation files
+        categories (list[dict]): category metadata. Each dict needs to have:
+            "id": corresponds to the "category_id" in the json annotations
+            "isthing": 0 or 1
+    """
+    os.makedirs(sem_seg_root, exist_ok=True)
+    stuff_ids = [k["id"] for k in categories if k["isthing"] == 0]
+    thing_ids = [k["id"] for k in categories if k["isthing"] == 1]
+    # Stuff classes take the labels 1..254; 0 is used for things and 255 for unlabeled.
+    assert len(stuff_ids) <= 254
+    # map from category id to id in the output semantic annotation
+    id_map = {stuff_id: i + 1 for i, stuff_id in enumerate(stuff_ids)}
+    id_map.update({thing_id: 0 for thing_id in thing_ids})
+    id_map[0] = 255
+    with open(panoptic_json) as f:
+        obj = json.load(f)
+    def iter_annotations():
+        # Yield (input path, output path, segments) for every annotated image.
+        for anno in obj["annotations"]:
+            file_name = anno["file_name"]
+            yield (
+                os.path.join(panoptic_root, file_name),
+                os.path.join(sem_seg_root, file_name),
+                anno["segments_info"],
+            )
+    print("Start writing to {} ...".format(sem_seg_root))
+    start = time.time()
+    # The context manager terminates the worker processes once the (blocking)
+    # starmap call has returned, so no pool is left behind between calls.
+    with mp.Pool(processes=max(mp.cpu_count() // 2, 4)) as pool:
+        pool.starmap(
+            functools.partial(_process_panoptic_to_semantic, id_map=id_map),
+            iter_annotations(),
+            chunksize=100,
+        )
+    print("Finished. time: {:.2f}s".format(time.time() - start))
+if __name__ == "__main__":
+    # The dataset root is read from $DETECTRON2_DATASETS and falls back to "datasets".
+    dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco")
+    for s in ["val2017", "train2017"]:
+        separate_coco_semantic_from_panoptic(
+            os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)),
+            os.path.join(dataset_dir, "panoptic_{}".format(s)),
+            os.path.join(dataset_dir, "panoptic_stuff_{}".format(s)),
+            COCO_CATEGORIES,
+        )
+    # Prepare val2017_100 for quick testing:
+    dest_dir = os.path.join(dataset_dir, "annotations/")
+    URL_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
+    download(URL_PREFIX + "annotations/coco/panoptic_val2017_100.json", dest_dir)
+    with open(os.path.join(dest_dir, "panoptic_val2017_100.json")) as f:
+        obj = json.load(f)
+    def link_val100(dir_full, dir_100):
+        """Symlink into dir_100 the PNG in dir_full of every image listed in the 100-image json."""
+        print("Creating " + dir_100 + " ...")
+        os.makedirs(dir_100, exist_ok=True)
+        for img in obj["images"]:
+            basename = os.path.splitext(img["file_name"])[0]
+            src = os.path.join(dir_full, basename + ".png")
+            dst = os.path.join(dir_100, basename + ".png")
+            # The link target is expressed relative to the directory holding the link.
+            src = os.path.relpath(src, start=dir_100)
+            # Keep links left by an earlier run; os.symlink would raise FileExistsError
+            # on them and make the script impossible to rerun.
+            if not os.path.lexists(dst):
+                os.symlink(src, dst)
+    link_val100(
+        os.path.join(dataset_dir, "panoptic_val2017"),
+        os.path.join(dataset_dir, "panoptic_val2017_100"),
+    )
+    link_val100(
+        os.path.join(dataset_dir, "panoptic_stuff_val2017"),
+        os.path.join(dataset_dir, "panoptic_stuff_val2017_100"),
+    )
--- a/demo/README.md
+++ b/demo/README.md
+## Detectron2 Demo
+We provide a command line tool to run a simple demo of builtin configs.
+The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md).
+See our [blog post](https://ai.facebook.com/blog/-detectron2-a-pytorch-based-modular-object-detection-library-)
+for a high-quality demo generated with this tool.
--- a/demo/demo.py
+++ b/demo/demo.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import argparse
+import glob
+import multiprocessing as mp
+import numpy as np
+import os
+import tempfile
+import time
+import warnings
+import cv2
+import tqdm
+from detectron2.config import get_cfg
+from detectron2.data.detection_utils import read_image
+from detectron2.utils.logger import setup_logger
+from predictor import VisualizationDemo
+# constants
+WINDOW_NAME = "COCO detections"
+def setup_cfg(args):
+    """
+    Build the frozen config used by the demo.
+    Args:
+        args: parsed command-line arguments; `config_file`, `opts` and
+            `confidence_threshold` are read.
+    Returns:
+        the config after merging the file and the command-line overrides, with the
+        score thresholds of the builtin models set, and frozen.
+    """
+    cfg = get_cfg()
+    # To use demo for Panoptic-DeepLab, please uncomment the following two lines.
+    # from detectron2.projects.panoptic_deeplab import add_panoptic_deeplab_config  # noqa
+    # add_panoptic_deeplab_config(cfg)
+    # The config file is merged first; command-line 'KEY VALUE' pairs are merged on top of it.
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    # The same threshold is assigned to every builtin model family, after both merges.
+    threshold = args.confidence_threshold
+    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = threshold
+    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = threshold
+    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = threshold
+    cfg.freeze()
+    return cfg
+def get_parser():
+    """
+    Create the command-line parser of the demo.
+    Returns:
+        argparse.ArgumentParser: parser with the options --config-file, --webcam,
+        --video-input, --input, --output, --confidence-threshold and --opts.
+    """
+    parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs")
+    add = parser.add_argument
+    add(
+        "--config-file",
+        default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
+        metavar="FILE",
+        help="path to config file",
+    )
+    # Input sources.
+    add("--webcam", action="store_true", help="Take inputs from webcam.")
+    add("--video-input", help="Path to video file.")
+    add(
+        "--input",
+        nargs="+",
+        help="A list of space separated input images; "
+        "or a single glob pattern such as 'directory/*.jpg'",
+    )
+    # Output destination.
+    add(
+        "--output",
+        help="A file or directory to save output visualizations. "
+        "If not given, will show output in an OpenCV window.",
+    )
+    add(
+        "--confidence-threshold",
+        type=float,
+        default=0.5,
+        help="Minimum score for instance predictions to be shown",
+    )
+    # Everything following --opts is collected verbatim as config overrides.
+    add(
+        "--opts",
+        help="Modify config options using the command-line 'KEY VALUE' pairs",
+        default=[],
+        nargs=argparse.REMAINDER,
+    )
+    return parser
+def test_opencv_video_format(codec, file_ext):
+    """
+    Check whether OpenCV produces a video file for the given codec and extension.
+    A 30-frame 10x10 black clip is written into a temporary directory.
+    Args:
+        codec (str): fourcc characters of the codec, e.g. "x264".
+        file_ext (str): file extension including the dot, e.g. ".mkv".
+    Returns:
+        bool: True if a file exists at the target path after the writer is released.
+    """
+    with tempfile.TemporaryDirectory(prefix="video_format_test") as tmp_dir:
+        filename = os.path.join(tmp_dir, "test_file" + file_ext)
+        writer = cv2.VideoWriter(
+            filename=filename,
+            fourcc=cv2.VideoWriter_fourcc(*codec),
+            fps=float(30),
+            frameSize=(10, 10),
+            isColor=True,
+        )
+        blank_frame = np.zeros((10, 10, 3), np.uint8)
+        for _ in range(30):
+            writer.write(blank_frame)
+        writer.release()
+        # Must be evaluated inside the `with`: the directory is removed on exit.
+        return os.path.isfile(filename)
+if __name__ == "__main__":
+    mp.set_start_method("spawn", force=True)
+    args = get_parser().parse_args()
+    setup_logger(name="fvcore")
+    logger = setup_logger()
+    logger.info("Arguments: " + str(args))
+    cfg = setup_cfg(args)
+    demo = VisualizationDemo(cfg)
+    # Exactly one input source is served, with precedence: --input, --webcam, --video-input.
+    if args.input:
+        if len(args.input) == 1:
+            # A single argument is treated as a glob pattern.
+            args.input = glob.glob(os.path.expanduser(args.input[0]))
+            assert args.input, "The input path(s) was not found"
+        for path in tqdm.tqdm(args.input, disable=not args.output):
+            # use PIL, to be consistent with evaluation
+            img = read_image(path, format="BGR")
+            start_time = time.time()
+            predictions, visualized_output = demo.run_on_image(img)
+            logger.info(
+                "{}: {} in {:.2f}s".format(
+                    path,
+                    "detected {} instances".format(len(predictions["instances"]))
+                    if "instances" in predictions
+                    else "finished",
+                    time.time() - start_time,
+                )
+            )
+            if args.output:
+                if os.path.isdir(args.output):
+                    out_filename = os.path.join(args.output, os.path.basename(path))
+                else:
+                    # A plain file path can only receive the visualization of one image.
+                    assert len(args.input) == 1, "Please specify a directory with args.output"
+                    out_filename = args.output
+                visualized_output.save(out_filename)
+            else:
+                cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
+                # get_image() is reversed along the channel axis before being shown with OpenCV.
+                cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
+                if cv2.waitKey(0) == 27:
+                    break  # esc to quit
+    elif args.webcam:
+        assert args.input is None, "Cannot have both --input and --webcam!"
+        assert args.output is None, "output not yet supported with --webcam!"
+        cam = cv2.VideoCapture(0)
+        for vis in tqdm.tqdm(demo.run_on_video(cam)):
+            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
+            cv2.imshow(WINDOW_NAME, vis)
+            if cv2.waitKey(1) == 27:
+                break  # esc to quit
+        cam.release()
+        cv2.destroyAllWindows()
+    elif args.video_input:
+        video = cv2.VideoCapture(args.video_input)
+        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames_per_second = video.get(cv2.CAP_PROP_FPS)
+        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+        basename = os.path.basename(args.video_input)
+        # Prefer x264/.mkv and fall back to mp4v/.mp4 when the probe file cannot be written.
+        codec, file_ext = (
+            ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
+        )
+        # The codec name carries no leading dot (only the extension does); comparing
+        # against ".mp4v" could never match, so the warning was never emitted.
+        if codec == "mp4v":
+            warnings.warn("x264 codec not available, switching to mp4v")
+        if args.output:
+            if os.path.isdir(args.output):
+                output_fname = os.path.join(args.output, basename)
+                output_fname = os.path.splitext(output_fname)[0] + file_ext
+            else:
+                output_fname = args.output
+            # Refuse to overwrite an existing output file.
+            assert not os.path.isfile(output_fname), output_fname
+            output_file = cv2.VideoWriter(
+                filename=output_fname,
+                # some installation of opencv may not support x264 (due to its license),
+                # you can try other format (e.g. MPEG)
+                fourcc=cv2.VideoWriter_fourcc(*codec),
+                fps=float(frames_per_second),
+                frameSize=(width, height),
+                isColor=True,
+            )
+        assert os.path.isfile(args.video_input)
+        for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
+            if args.output:
+                output_file.write(vis_frame)
+            else:
+                cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
+                cv2.imshow(basename, vis_frame)
+                if cv2.waitKey(1) == 27:
+                    break  # esc to quit
+        video.release()
+        if args.output:
+            output_file.release()
+        else:
+            cv2.destroyAllWindows()
--- a/demo/predictor.py
+++ b/demo/predictor.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import atexit
+import bisect
+import multiprocessing as mp
+from collections import deque
+import cv2
+import torch
+from detectron2.data import MetadataCatalog
+from detectron2.engine.defaults import DefaultPredictor
+from detectron2.utils.video_visualizer import VideoVisualizer
+from detectron2.utils.visualizer import ColorMode, Visualizer
+class VisualizationDemo(object):
+    """Runs a predictor on images or video frames and draws its predictions."""
+    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
+        """
+        Args:
+            cfg (CfgNode):
+            instance_mode (ColorMode):
+            parallel (bool): whether to run the model in different processes from visualization.
+                Useful since the visualization logic can be slow.
+        """
+        # Metadata comes from the first test dataset; "__unused" is looked up when none is set.
+        self.metadata = MetadataCatalog.get(
+            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
+        )
+        self.cpu_device = torch.device("cpu")
+        self.instance_mode = instance_mode
+        self.parallel = parallel
+        if parallel:
+            num_gpu = torch.cuda.device_count()
+            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
+        else:
+            self.predictor = DefaultPredictor(cfg)
+    def run_on_image(self, image):
+        """
+        Args:
+            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
+                This is the format used by OpenCV.
+        Returns:
+            predictions (dict): the output of the model.
+            vis_output (VisImage): the visualized image output, or None when the
+                predictions contain none of "panoptic_seg", "sem_seg" and "instances".
+        """
+        vis_output = None
+        predictions = self.predictor(image)
+        # Convert image from OpenCV BGR format to Matplotlib RGB format.
+        image = image[:, :, ::-1]
+        visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
+        if "panoptic_seg" in predictions:
+            panoptic_seg, segments_info = predictions["panoptic_seg"]
+            vis_output = visualizer.draw_panoptic_seg_predictions(
+                panoptic_seg.to(self.cpu_device), segments_info
+            )
+        else:
+            # Both branches may run on the same visualizer; the last result is returned.
+            if "sem_seg" in predictions:
+                vis_output = visualizer.draw_sem_seg(
+                    predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
+                )
+            if "instances" in predictions:
+                instances = predictions["instances"].to(self.cpu_device)
+                vis_output = visualizer.draw_instance_predictions(predictions=instances)
+        return predictions, vis_output
+    def _frame_from_video(self, video):
+        """Yield frames read from `video` until it is closed or a read fails."""
+        while video.isOpened():
+            success, frame = video.read()
+            if success:
+                yield frame
+            else:
+                break
+    def run_on_video(self, video):
+        """
+        Visualizes predictions on frames of the input video.
+        Args:
+            video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
+                either a webcam or a video file.
+        Yields:
+            ndarray: BGR visualizations of each video frame.
+        Raises:
+            ValueError: if the predictions of a frame contain none of "panoptic_seg",
+                "instances" and "sem_seg".
+        """
+        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
+        def process_predictions(frame, predictions):
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            if "panoptic_seg" in predictions:
+                panoptic_seg, segments_info = predictions["panoptic_seg"]
+                vis_frame = video_visualizer.draw_panoptic_seg_predictions(
+                    frame, panoptic_seg.to(self.cpu_device), segments_info
+                )
+            elif "instances" in predictions:
+                predictions = predictions["instances"].to(self.cpu_device)
+                vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
+            elif "sem_seg" in predictions:
+                vis_frame = video_visualizer.draw_sem_seg(
+                    frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
+                )
+            else:
+                # Without this branch `vis_frame` stays unbound and the failure surfaces
+                # as an UnboundLocalError that does not say what went wrong.
+                raise ValueError(
+                    "Cannot visualize predictions with keys {}; expected one of "
+                    "'panoptic_seg', 'instances' or 'sem_seg'.".format(sorted(predictions))
+                )
+            # Converts Matplotlib RGB format to OpenCV BGR format
+            vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
+            return vis_frame
+        frame_gen = self._frame_from_video(video)
+        if self.parallel:
+            buffer_size = self.predictor.default_buffer_size
+            # Frames waiting for their predictions, oldest first.
+            frame_data = deque()
+            for cnt, frame in enumerate(frame_gen):
+                frame_data.append(frame)
+                self.predictor.put(frame)
+                # Start consuming results only once `buffer_size` frames are in flight.
+                if cnt >= buffer_size:
+                    frame = frame_data.popleft()
+                    predictions = self.predictor.get()
+                    yield process_predictions(frame, predictions)
+            # Drain the frames that are still pending after the input ended.
+            while len(frame_data):
+                frame = frame_data.popleft()
+                predictions = self.predictor.get()
+                yield process_predictions(frame, predictions)
+        else:
+            for frame in frame_gen:
+                yield process_predictions(frame, self.predictor(frame))
+class AsyncPredictor:
+    """
+    A predictor that runs the model asynchronously, possibly on >1 GPUs.
+    Because rendering the visualization takes a considerable amount of time,
+    this helps improve throughput a little bit when rendering videos.
+    """
+    class _StopToken:
+        """Marker put on the task queue to make a worker leave its loop."""
+    class _PredictWorker(mp.Process):
+        """Process that serves (index, image) tasks with its own DefaultPredictor."""
+        def __init__(self, cfg, task_queue, result_queue):
+            self.cfg = cfg
+            self.task_queue = task_queue
+            self.result_queue = result_queue
+            super().__init__()
+        def run(self):
+            # The predictor is built inside the worker process, not in the parent.
+            predictor = DefaultPredictor(self.cfg)
+            while True:
+                task = self.task_queue.get()
+                if isinstance(task, AsyncPredictor._StopToken):
+                    return
+                idx, data = task
+                self.result_queue.put((idx, predictor(data)))
+    def __init__(self, cfg, num_gpus: int = 1):
+        """
+        Args:
+            cfg (CfgNode):
+            num_gpus (int): if 0, will run on CPU
+        """
+        num_workers = max(num_gpus, 1)
+        self.task_queue = mp.Queue(maxsize=num_workers * 3)
+        self.result_queue = mp.Queue(maxsize=num_workers * 3)
+        self.procs = []
+        for gpuid in range(num_workers):
+            # Each worker gets its own unfrozen copy of the config with its device set.
+            cfg = cfg.clone()
+            cfg.defrost()
+            if num_gpus > 0:
+                cfg.MODEL.DEVICE = "cuda:{}".format(gpuid)
+            else:
+                cfg.MODEL.DEVICE = "cpu"
+            worker = AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
+            self.procs.append(worker)
+        self.put_idx = 0
+        self.get_idx = 0
+        # Results that arrived ahead of their turn: parallel lists sorted by index.
+        self.result_rank = []
+        self.result_data = []
+        for worker in self.procs:
+            worker.start()
+        atexit.register(self.shutdown)
+    def put(self, image):
+        """Queue `image` for prediction under the next running index."""
+        self.put_idx += 1
+        self.task_queue.put((self.put_idx, image))
+    def get(self):
+        """Return the result of the oldest request not yet returned, blocking until it arrives."""
+        self.get_idx += 1  # the index needed for this request
+        wanted = self.get_idx
+        if self.result_rank and self.result_rank[0] == wanted:
+            self.result_rank.pop(0)
+            return self.result_data.pop(0)
+        while True:
+            # make sure the results are returned in the correct order
+            idx, res = self.result_queue.get()
+            if idx == wanted:
+                return res
+            # Park a result that came in early, keeping both lists sorted by index.
+            position = bisect.bisect(self.result_rank, idx)
+            self.result_rank.insert(position, idx)
+            self.result_data.insert(position, res)
+    def __len__(self):
+        """Number of requests put but not yet retrieved."""
+        return self.put_idx - self.get_idx
+    def __call__(self, image):
+        self.put(image)
+        return self.get()
+    def shutdown(self):
+        """Put one stop token on the task queue per worker process."""
+        for _ in range(len(self.procs)):
+            self.task_queue.put(AsyncPredictor._StopToken())
+    @property
+    def default_buffer_size(self):
+        return 5 * len(self.procs)
--- a/detectron2/__init__.py
+++ b/detectron2/__init__.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .utils.env import setup_environment
+setup_environment()
+# The line below is programmatically read and written by setup.py.
+# Leave it at the bottom of this file and don't touch it.
+__version__ = "0.6"
--- a/detectron2/checkpoint/__init__.py
+++ b/detectron2/checkpoint/__init__.py
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+# File:
+from . import catalog as _UNUSED  # register the handler
+from .detection_checkpoint import DetectionCheckpointer
+from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer
+__all__ = ["Checkpointer", "PeriodicCheckpointer", "DetectionCheckpointer"]
--- a/detectron2/checkpoint/c2_model_loading.py
+++ b/detectron2/checkpoint/c2_model_loading.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import re
+from typing import Dict, List
+import torch
+from tabulate import tabulate
+def convert_basic_c2_names(original_keys):
+    """
+    Apply some basic name conversion to names in C2 weights.
+    It only deals with typical backbone models.
+    Args:
+        original_keys (list[str]):
+    Returns:
+        list[str]: The same number of strings matching those in original_keys.
+    """
+    # Whole-name aliases, resolved before any "_" is turned into ".".
+    aliases = {"pred_b": "linear_b", "pred_w": "linear_w"}
+    # Ordered rewrite rules as (is_regex, pattern, replacement). The order is significant:
+    # later rules match the output of earlier ones (".b" becomes ".bias" before
+    # "bn.bias" becomes "norm.bias").
+    rules = (
+        (False, "_", "."),
+        (True, "\\.b$", ".bias"),
+        (True, "\\.w$", ".weight"),
+        # Uniform both bn and gn names to "norm"
+        (True, "bn\\.s$", "norm.weight"),
+        (True, "bn\\.bias$", "norm.bias"),
+        # NOTE(review): unlike its neighbours this pattern has no "$" anchor - confirm intent.
+        (True, "bn\\.rm", "norm.running_mean"),
+        (True, "bn\\.running.mean$", "norm.running_mean"),
+        (True, "bn\\.riv$", "norm.running_var"),
+        (True, "bn\\.running.var$", "norm.running_var"),
+        (True, "bn\\.gamma$", "norm.weight"),
+        (True, "bn\\.beta$", "norm.bias"),
+        (True, "gn\\.s$", "norm.weight"),
+        (True, "gn\\.bias$", "norm.bias"),
+        # stem
+        (True, "^res\\.conv1\\.norm\\.", "conv1.norm."),
+        # to avoid mis-matching with "conv1" in other components (e.g. detection head)
+        (True, "^conv1\\.", "stem.conv1."),
+        # The C2 stage names (res2-5) are kept rather than torchvision's layer1-4.
+        # blocks
+        (False, ".branch1.", ".shortcut."),
+        (False, ".branch2a.", ".conv1."),
+        (False, ".branch2b.", ".conv2."),
+        (False, ".branch2c.", ".conv3."),
+        # DensePose substitutions
+        (True, "^body.conv.fcn", "body_conv_fcn"),
+        (False, "AnnIndex.lowres", "ann_index_lowres"),
+        (False, "Index.UV.lowres", "index_uv_lowres"),
+        (False, "U.lowres", "u_lowres"),
+        (False, "V.lowres", "v_lowres"),
+    )
+    def rename(key):
+        # Run one key through the alias table and then through every rule, in order.
+        key = aliases.get(key, key)
+        for is_regex, pattern, replacement in rules:
+            if is_regex:
+                key = re.sub(pattern, replacement, key)
+            else:
+                key = key.replace(pattern, replacement)
+        return key
+    return [rename(key) for key in copy.deepcopy(original_keys)]
+def convert_c2_detectron_names(weights):
+    """
+    Map Caffe2 Detectron weight names to Detectron2 names.
+    Args:
+        weights (dict): name -> tensor
+    Returns:
+        dict: detectron2 names -> tensor
+        dict: detectron2 names -> C2 names
+    """
+    log = logging.getLogger(__name__)
+    log.info("Renaming Caffe2 weights ......")
+    original_keys = sorted(weights.keys())
+    names = convert_basic_c2_names(copy.deepcopy(original_keys))
+    def replace_all(keys, substitutions):
+        # Plain substring substitutions: one full pass over the keys per (old, new) pair.
+        for old, new in substitutions:
+            keys = [key.replace(old, new) for key in keys]
+        return keys
+    def sub_all(keys, substitutions):
+        # Regex substitutions: one full pass over the keys per (pattern, new) pair.
+        for pattern, new in substitutions:
+            keys = [re.sub(pattern, new, key) for key in keys]
+        return keys
+    # --------------------------------------------------------------------------
+    # RPN hidden representation conv and box transformation conv
+    # --------------------------------------------------------------------------
+    # In the C2 model, the RPN layers are defined for FPN level 2 and then shared for
+    # all other levels, hence the "fpn2" variants. Each FPN variant is listed before
+    # the non-FPN pattern that is a prefix of it.
+    names = replace_all(
+        names,
+        (
+            ("conv.rpn.fpn2", "proposal_generator.rpn_head.conv"),
+            ("conv.rpn", "proposal_generator.rpn_head.conv"),
+            ("rpn.bbox.pred.fpn2", "proposal_generator.rpn_head.anchor_deltas"),
+            ("rpn.cls.logits.fpn2", "proposal_generator.rpn_head.objectness_logits"),
+            ("rpn.bbox.pred", "proposal_generator.rpn_head.anchor_deltas"),
+            ("rpn.cls.logits", "proposal_generator.rpn_head.objectness_logits"),
+        ),
+    )
+    # --------------------------------------------------------------------------
+    # Fast R-CNN box head
+    # --------------------------------------------------------------------------
+    names = sub_all(
+        names,
+        (
+            ("^bbox\\.pred", "bbox_pred"),
+            ("^cls\\.score", "cls_score"),
+            ("^fc6\\.", "box_head.fc1."),
+            ("^fc7\\.", "box_head.fc2."),
+            # 4conv1fc head tensor names: head_conv1_w, head_conv1_gn_s
+            ("^head\\.conv", "box_head.conv"),
+        ),
+    )
+    # --------------------------------------------------------------------------
+    # FPN lateral and output convolutions
+    # --------------------------------------------------------------------------
+    def fpn_map(name):
+        """
+        Look for keys with the following patterns:
+        1) Starts with "fpn.inner."
+           Example: "fpn.inner.res2.2.sum.lateral.weight"
+           Meaning: These are lateral pathway convolutions
+        2) Starts with "fpn.res"
+           Example: "fpn.res2.2.sum.weight"
+           Meaning: These are FPN output convolutions
+        Any other name is returned unchanged.
+        """
+        is_lateral = name.startswith("fpn.inner.")
+        if not is_lateral and not name.startswith("fpn.res"):
+            return name
+        parts = name.split(".")
+        # lateral example: ['fpn', 'inner', 'res2', '2', 'sum', 'lateral', 'weight']
+        # output example:  ['fpn', 'res2', '2', 'sum', 'weight']
+        stage_token = parts[2] if is_lateral else parts[1]
+        stage = int(stage_token[len("res") :])
+        norm = ".norm" if "norm" in parts else ""
+        template = "fpn_lateral{}{}.{}" if is_lateral else "fpn_output{}{}.{}"
+        return template.format(stage, norm, parts[-1])
+    names = [fpn_map(key) for key in names]
+    # --------------------------------------------------------------------------
+    # Mask R-CNN mask head
+    # --------------------------------------------------------------------------
+    # roi_heads.StandardROIHeads case
+    names = replace_all(names, ((".[mask].fcn", "mask_head.mask_fcn"),))
+    names = sub_all(names, (("^\\.mask\\.fcn", "mask_head.mask_fcn"),))
+    names = replace_all(
+        names,
+        (
+            ("mask.fcn.logits", "mask_head.predictor"),
+            # roi_heads.Res5ROIHeads case
+            ("conv5.mask", "mask_head.deconv"),
+            # ----------------------------------------------------------------------
+            # Keypoint R-CNN head
+            # ----------------------------------------------------------------------
+            # interestingly, the keypoint head convs have blob names that are simply "conv_fcnX"
+            ("conv.fcn", "roi_heads.keypoint_head.conv_fcn"),
+            ("kps.score.lowres", "roi_heads.keypoint_head.score_lowres"),
+            ("kps.score.", "roi_heads.keypoint_head.score."),
+        ),
+    )
+    # --------------------------------------------------------------------------
+    # Done with replacements
+    # --------------------------------------------------------------------------
+    # The renaming must stay one-to-one: no two C2 names may collapse into one.
+    assert len(set(names)) == len(names)
+    assert len(original_keys) == len(names)
+    new_keys_to_original_keys = dict(zip(names, original_keys))
+    new_weights = {}
+    for orig, renamed in zip(original_keys, names):
+        tensor = weights[orig]
+        if renamed.startswith(("bbox_pred.", "mask_head.predictor.")):
+            # remove the meaningless prediction weight for background class
+            # (the first 4 rows for box regression, the first row for the mask predictor)
+            first_kept = 4 if renamed.startswith("bbox_pred.") else 1
+            new_weights[renamed] = tensor[first_kept:]
+            log.info(
+                "Remove prediction weight for background class in {}. The shape changes from "
+                "{} to {}.".format(
+                    renamed, tuple(tensor.shape), tuple(new_weights[renamed].shape)
+                )
+            )
+            continue
+        if renamed.startswith("cls_score."):
+            # move weights of bg class from original index 0 to last index
+            log.info(
+                "Move classification weights for background class in {} from index 0 to "
+                "index {}.".format(renamed, tensor.shape[0] - 1)
+            )
+            new_weights[renamed] = torch.cat([tensor[1:], tensor[:1]])
+            continue
+        new_weights[renamed] = tensor
+    return new_weights, new_keys_to_original_keys
+# Note the current matching is not symmetric.
+# it assumes model_state_dict will have longer names.
+def align_and_update_state_dicts(model_state_dict, ckpt_state_dict, c2_conversion=True):
+    """
+    Match names between the two state-dict, and returns a new chkpt_state_dict with names
+    converted to match model_state_dict with heuristics. The returned dict can be later
+    loaded with fvcore checkpointer.
+    If `c2_conversion==True`, `ckpt_state_dict` is assumed to be a Caffe2
+    model and will be renamed at first.
+    Strategy: suppose that the models that we will create will have prefixes appended
+    to each of its keys, for example due to an extra level of nesting that the original
+    pre-trained weights from ImageNet won't contain. For example, model.state_dict()
+    might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains
+    res2.conv1.weight. We thus want to match both parameters together.
+    For that, we look for each model weight, look among all loaded keys if there is one
+    that is a suffix of the current weight name, and use it if that's the case.
+    If multiple matches exist, take the one with longest size
+    of the corresponding name. For example, for the same model as before, the pretrained
+    weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case,
+    we want to match backbone[0].body.conv1.weight to conv1.weight, and
+    backbone[0].body.res2.conv1.weight to res2.conv1.weight.
+    """
+    model_keys = sorted(model_state_dict.keys())
+    if c2_conversion:
+        ckpt_state_dict, original_keys = convert_c2_detectron_names(ckpt_state_dict)
+        # original_keys: the name in the original dict (before renaming)
+    else:
+        original_keys = {x: x for x in ckpt_state_dict.keys()}
+    ckpt_keys = sorted(ckpt_state_dict.keys())
+    def match(a, b):
+        # Matched ckpt_key should be a complete (starts with '.') suffix.
+        # For example, roi_heads.mesh_head.whatever_conv1 does not match conv1,
+        # but matches whatever_conv1 or mesh_head.whatever_conv1.
+        return a == b or a.endswith("." + b)
+    # get a matrix of string matches, where each (i, j) entry correspond to the size of the
+    # ckpt_key string, if it matches
+    match_matrix = [len(j) if match(i, j) else 0 for i in model_keys for j in ckpt_keys]
+    match_matrix = torch.as_tensor(match_matrix).view(len(model_keys), len(ckpt_keys))
+    # use the matched one with longest size in case of multiple matches
+    max_match_size, idxs = match_matrix.max(1)
+    # remove indices that correspond to no-match
+    idxs[max_match_size == 0] = -1
+    logger = logging.getLogger(__name__)
+    # matched_pairs (matched checkpoint key --> matched model key)
+    matched_keys = {}
+    result_state_dict = {}
+    for idx_model, idx_ckpt in enumerate(idxs.tolist()):
+        if idx_ckpt == -1:
+            continue
+        key_model = model_keys[idx_model]
+        key_ckpt = ckpt_keys[idx_ckpt]
+        value_ckpt = ckpt_state_dict[key_ckpt]
+        shape_in_model = model_state_dict[key_model].shape
+        if shape_in_model != value_ckpt.shape:
+            logger.warning(
+                "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format(
+                    key_ckpt, value_ckpt.shape, key_model, shape_in_model
+                )
+            )
+            logger.warning(
+                "{} will not be loaded. Please double check and see if this is desired.".format(
+                    key_ckpt
+                )
+            )
+            continue
+        assert key_model not in result_state_dict
+        result_state_dict[key_model] = value_ckpt
+        if key_ckpt in matched_keys:  # already added to matched_keys
+            logger.error(
+                "Ambiguity found for {} in checkpoint!"
+                "It matches at least two keys in the model ({} and {}).".format(
+                    key_ckpt, key_model, matched_keys[key_ckpt]
+                )
+            )
+            raise ValueError("Cannot match one checkpoint key to multiple keys in the model.")
+        matched_keys[key_ckpt] = key_model
+    # logging:
+    matched_model_keys = sorted(matched_keys.values())
+    if len(matched_model_keys) == 0:
+        logger.warning("No weights in checkpoint matched with model.")
+        return ckpt_state_dict
+    common_prefix = _longest_common_prefix(matched_model_keys)
+    rev_matched_keys = {v: k for k, v in matched_keys.items()}
+    original_keys = {k: original_keys[rev_matched_keys[k]] for k in matched_model_keys}
+    model_key_groups = _group_keys_by_module(matched_model_keys, original_keys)
+    table = []
+    memo = set()
+    for key_model in matched_model_keys:
+        if key_model in memo:
+            continue
+        if key_model in model_key_groups:
+            group = model_key_groups[key_model]
+            memo |= set(group)
+            shapes = [tuple(model_state_dict[k].shape) for k in group]
+            table.append(
+                (
+                    _longest_common_prefix([k[len(common_prefix) :] for k in group]) + "*",
+                    _group_str([original_keys[k] for k in group]),
+                    " ".join([str(x).replace(" ", "") for x in shapes]),
+                )
+            )
+        else:
+            key_checkpoint = original_keys[key_model]
+            shape = str(tuple(model_state_dict[key_model].shape))
+            table.append((key_model[len(common_prefix) :], key_checkpoint, shape))
+    table_str = tabulate(
+        table, tablefmt="pipe", headers=["Names in Model", "Names in Checkpoint", "Shapes"]
+    )
+    logger.info(
+        "Following weights matched with "
+        + (f"submodule {common_prefix[:-1]}" if common_prefix else "model")
+        + ":\n"
+        + table_str
+    )
+    unmatched_ckpt_keys = [k for k in ckpt_keys if k not in set(matched_keys.keys())]
+    for k in unmatched_ckpt_keys:
+        result_state_dict[k] = ckpt_state_dict[k]
+    return result_state_dict
+def _group_keys_by_module(keys: List[str], original_names: Dict[str, str]):
+    """
+    Params in the same submodule are grouped together.
+    Args:
+        keys: names of all parameters
+        original_names: mapping from parameter name to their name in the checkpoint
+    Returns:
+        dict[name -> all other names in the same group]
+    """
+    def _submodule_name(key):
+        pos = key.rfind(".")
+        if pos < 0:
+            return None
+        prefix = key[: pos + 1]
+        return prefix
+    all_submodules = [_submodule_name(k) for k in keys]
+    all_submodules = [x for x in all_submodules if x]
+    all_submodules = sorted(all_submodules, key=len)
+    ret = {}
+    for prefix in all_submodules:
+        group = [k for k in keys if k.startswith(prefix)]
+        if len(group) <= 1:
+            continue
+        original_name_lcp = _longest_common_prefix_str([original_names[k] for k in group])
+        if len(original_name_lcp) == 0:
+            # don't group weights if original names don't share prefix
+            continue
+        for k in group:
+            if k in ret:
+                continue
+            ret[k] = group
+    return ret
+def _longest_common_prefix(names: List[str]) -> str:
+    """
+    ["abc.zfg", "abc.zef"] -> "abc."
+    """
+    names = [n.split(".") for n in names]
+    m1, m2 = min(names), max(names)
+    ret = [a for a, b in zip(m1, m2) if a == b]
+    ret = ".".join(ret) + "." if len(ret) else ""
+    return ret
+def _longest_common_prefix_str(names: List[str]) -> str:
+    m1, m2 = min(names), max(names)
+    lcp = [a for a, b in zip(m1, m2) if a == b]
+    lcp = "".join(lcp)
+    return lcp
+def _group_str(names: List[str]) -> str:
+    """
+    Turn "common1", "common2", "common3" into "common{1,2,3}"
+    """
+    lcp = _longest_common_prefix_str(names)
+    rest = [x[len(lcp) :] for x in names]
+    rest = "{" + ",".join(rest) + "}"
+    ret = lcp + rest
+    # add some simplification for BN specifically
+    ret = ret.replace("bn_{beta,running_mean,running_var,gamma}", "bn_*")
+    ret = ret.replace("bn_beta,bn_running_mean,bn_running_var,bn_gamma", "bn_*")
+    return ret
--- a/detectron2/checkpoint/catalog.py
+++ b/detectron2/checkpoint/catalog.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from detectron2.utils.file_io import PathHandler, PathManager
+class ModelCatalog(object):
+    """
+    Store mappings from names to third-party models.
+    """
+    S3_C2_DETECTRON_PREFIX = "https://dl.fbaipublicfiles.com/detectron"
+    # MSRA models have STRIDE_IN_1X1=True. False otherwise.
+    # NOTE: all BN models here have fused BN into an affine layer.
+    # As a result, you should only load them to a model with "FrozenBN".
+    # Loading them to a model with regular BN or SyncBN is wrong.
+    # Even when loaded to FrozenBN, it is still different from affine by an epsilon,
+    # which should be negligible for training.
+    # NOTE: all models here uses PIXEL_STD=[1,1,1]
+    # NOTE: Most of the BN models here are no longer used. We use the
+    # re-converted pre-trained models under detectron2 model zoo instead.
+    C2_IMAGENET_MODELS = {
+        "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
+        "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
+        "FAIR/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
+        "FAIR/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
+        "FAIR/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
+        "FAIR/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
+        "FAIR/X-152-32x8d-IN5k": "ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl",
+    }
+    C2_DETECTRON_PATH_FORMAT = (
+        "{prefix}/{url}/output/train/{dataset}/{type}/model_final.pkl"  # noqa B950
+    )
+    C2_DATASET_COCO = "coco_2014_train%3Acoco_2014_valminusminival"
+    C2_DATASET_COCO_KEYPOINTS = "keypoints_coco_2014_train%3Akeypoints_coco_2014_valminusminival"
+    # format: {model_name} -> part of the url
+    C2_DETECTRON_MODELS = {
+        "35857197/e2e_faster_rcnn_R-50-C4_1x": "35857197/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml.01_33_49.iAX0mXvW",  # noqa B950
+        "35857345/e2e_faster_rcnn_R-50-FPN_1x": "35857345/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I",  # noqa B950
+        "35857890/e2e_faster_rcnn_R-101-FPN_1x": "35857890/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml.01_38_50.sNxI7sX7",  # noqa B950
+        "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "36761737/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml.06_31_39.5MIHi1fZ",  # noqa B950
+        "35858791/e2e_mask_rcnn_R-50-C4_1x": "35858791/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml.01_45_57.ZgkA7hPB",  # noqa B950
+        "35858933/e2e_mask_rcnn_R-50-FPN_1x": "35858933/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml.01_48_14.DzEQe4wC",  # noqa B950
+        "35861795/e2e_mask_rcnn_R-101-FPN_1x": "35861795/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT",  # noqa B950
+        "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "36761843/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml.06_35_59.RZotkLKI",  # noqa B950
+        "48616381/e2e_mask_rcnn_R-50-FPN_2x_gn": "GN/48616381/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn_0416.13_23_38.bTlTI97Q",  # noqa B950
+        "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "37697547/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml.08_42_54.kdzV35ao",  # noqa B950
+        "35998355/rpn_R-50-C4_1x": "35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L",  # noqa B950
+        "35998814/rpn_R-50-FPN_1x": "35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179",  # noqa B950
+        "36225147/fast_R-50-FPN_1x": "36225147/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml.08_39_09.L3obSdQ2",  # noqa B950
+    }
+    @staticmethod
+    def get(name):
+        if name.startswith("Caffe2Detectron/COCO"):
+            return ModelCatalog._get_c2_detectron_baseline(name)
+        if name.startswith("ImageNetPretrained/"):
+            return ModelCatalog._get_c2_imagenet_pretrained(name)
+        raise RuntimeError("model not present in the catalog: {}".format(name))
+    @staticmethod
+    def _get_c2_imagenet_pretrained(name):
+        prefix = ModelCatalog.S3_C2_DETECTRON_PREFIX
+        name = name[len("ImageNetPretrained/") :]
+        name = ModelCatalog.C2_IMAGENET_MODELS[name]
+        url = "/".join([prefix, name])
+        return url
+    @staticmethod
+    def _get_c2_detectron_baseline(name):
+        name = name[len("Caffe2Detectron/COCO/") :]
+        url = ModelCatalog.C2_DETECTRON_MODELS[name]
+        if "keypoint_rcnn" in name:
+            dataset = ModelCatalog.C2_DATASET_COCO_KEYPOINTS
+        else:
+            dataset = ModelCatalog.C2_DATASET_COCO
+        if "35998355/rpn_R-50-C4_1x" in name:
+            # this one model is somehow different from others ..
+            type = "rpn"
+        else:
+            type = "generalized_rcnn"
+        # Detectron C2 models are stored in the structure defined in `C2_DETECTRON_PATH_FORMAT`.
+        url = ModelCatalog.C2_DETECTRON_PATH_FORMAT.format(
+            prefix=ModelCatalog.S3_C2_DETECTRON_PREFIX, url=url, type=type, dataset=dataset
+        )
+        return url
+class ModelCatalogHandler(PathHandler):
+    """
+    Resolve URL like catalog://.
+    """
+    PREFIX = "catalog://"
+    def _get_supported_prefixes(self):
+        return [self.PREFIX]
+    def _get_local_path(self, path, **kwargs):
+        logger = logging.getLogger(__name__)
+        catalog_path = ModelCatalog.get(path[len(self.PREFIX) :])
+        logger.info("Catalog entry {} points to {}".format(path, catalog_path))
+        return PathManager.get_local_path(catalog_path, **kwargs)
+    def _open(self, path, mode="r", **kwargs):
+        return PathManager.open(self._get_local_path(path), mode, **kwargs)
+PathManager.register_handler(ModelCatalogHandler())
--- a/detectron2/checkpoint/detection_checkpoint.py
+++ b/detectron2/checkpoint/detection_checkpoint.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import os
+import pickle
+import torch
+from fvcore.common.checkpoint import Checkpointer
+from torch.nn.parallel import DistributedDataParallel
+import detectron2.utils.comm as comm
+from detectron2.utils.file_io import PathManager
+from .c2_model_loading import align_and_update_state_dicts
+class DetectionCheckpointer(Checkpointer):
+    """
+    Same as :class:`Checkpointer`, but is able to:
+    1. handle models in detectron & detectron2 model zoo, and apply conversions for legacy models.
+    2. correctly load checkpoints that are only available on the master worker
+    """
+    def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables):
+        is_main_process = comm.is_main_process()
+        super().__init__(
+            model,
+            save_dir,
+            save_to_disk=is_main_process if save_to_disk is None else save_to_disk,
+            **checkpointables,
+        )
+        self.path_manager = PathManager
+    def load(self, path, *args, **kwargs):
+        need_sync = False
+        if path and isinstance(self.model, DistributedDataParallel):
+            logger = logging.getLogger(__name__)
+            path = self.path_manager.get_local_path(path)
+            has_file = os.path.isfile(path)
+            all_has_file = comm.all_gather(has_file)
+            if not all_has_file[0]:
+                raise OSError(f"File {path} not found on main worker.")
+            if not all(all_has_file):
+                logger.warning(
+                    f"Not all workers can read checkpoint {path}. "
+                    "Training may fail to fully resume."
+                )
+                # TODO: broadcast the checkpoint file contents from main
+                # worker, and load from it instead.
+                need_sync = True
+            if not has_file:
+                path = None  # don't load if not readable
+        ret = super().load(path, *args, **kwargs)
+        if need_sync:
+            logger.info("Broadcasting model states from main worker ...")
+            self.model._sync_params_and_buffers()
+        return ret
+    def _load_file(self, filename):
+        if filename.endswith(".pkl"):
+            with PathManager.open(filename, "rb") as f:
+                data = pickle.load(f, encoding="latin1")
+            if "model" in data and "__author__" in data:
+                # file is in Detectron2 model zoo format
+                self.logger.info("Reading a file from '{}'".format(data["__author__"]))
+                return data
+            else:
+                # assume file is from Caffe2 / Detectron1 model zoo
+                if "blobs" in data:
+                    # Detection models have "blobs", but ImageNet models don't
+                    data = data["blobs"]
+                data = {k: v for k, v in data.items() if not k.endswith("_momentum")}
+                return {"model": data, "__author__": "Caffe2", "matching_heuristics": True}
+        elif filename.endswith(".pyth"):
+            # assume file is from pycls; no one else seems to use the ".pyth" extension
+            with PathManager.open(filename, "rb") as f:
+                data = torch.load(f)
+            assert (
+                "model_state" in data
+            ), f"Cannot load .pyth file {filename}; pycls checkpoints must contain 'model_state'."
+            model_state = {
+                k: v
+                for k, v in data["model_state"].items()
+                if not k.endswith("num_batches_tracked")
+            }
+            return {"model": model_state, "__author__": "pycls", "matching_heuristics": True}
+        loaded = super()._load_file(filename)  # load native pth checkpoint
+        if "model" not in loaded:
+            loaded = {"model": loaded}
+        return loaded
+    def _load_model(self, checkpoint):
+        if checkpoint.get("matching_heuristics", False):
+            self._convert_ndarray_to_tensor(checkpoint["model"])
+            # convert weights by name-matching heuristics
+            checkpoint["model"] = align_and_update_state_dicts(
+                self.model.state_dict(),
+                checkpoint["model"],
+                c2_conversion=checkpoint.get("__author__", None) == "Caffe2",
+            )
+        # for non-caffe2 models, use standard ways to load it
+        incompatible = super()._load_model(checkpoint)
+        model_buffers = dict(self.model.named_buffers(recurse=False))
+        for k in ["pixel_mean", "pixel_std"]:
+            # Ignore missing key message about pixel_mean/std.
+            # Though they may be missing in old checkpoints, they will be correctly
+            # initialized from config anyway.
+            if k in model_buffers:
+                try:
+                    incompatible.missing_keys.remove(k)
+                except ValueError:
+                    pass
+        for k in incompatible.unexpected_keys[:]:
+            # Ignore unexpected keys about cell anchors. They exist in old checkpoints
+            # but now they are non-persistent buffers and will not be in new checkpoints.
+            if "anchor_generator.cell_anchors" in k:
+                incompatible.unexpected_keys.remove(k)
+        return incompatible
--- a/detectron2/config/__init__.py
+++ b/detectron2/config/__init__.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .compat import downgrade_config, upgrade_config
+from .config import CfgNode, get_cfg, global_cfg, set_global_cfg, configurable
+from .instantiate import instantiate
+from .lazy import LazyCall, LazyConfig
+# Names that make up the public API of this package.
+__all__ = [
+    "CfgNode",
+    "get_cfg",
+    "global_cfg",
+    "set_global_cfg",
+    "downgrade_config",
+    "upgrade_config",
+    "configurable",
+    "instantiate",
+    "LazyCall",
+    "LazyConfig",
+]
+from detectron2.utils.env import fixup_module_metadata
+# NOTE(review): presumably rewrites the module metadata of the exported names so they
+# appear to come from this package; confirm against detectron2.utils.env.
+fixup_module_metadata(__name__, globals(), __all__)
+# the helper itself is not part of the package namespace
+del fixup_module_metadata
--- a/detectron2/config/compat.py
+++ b/detectron2/config/compat.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+Backward compatibility of configs.
+Instructions to bump version:
+ It's not needed to bump version if new keys are added.
+  It's only needed when backward-incompatible changes happen
+  (i.e., some existing keys disappear, or the meaning of a key changes)
+ To bump version, do the following:
+    1. Increment _C.VERSION in defaults.py
+    2. Add a converter in this file.
+      Each ConverterVX has a function "upgrade" which in-place upgrades config from X-1 to X,
+      and a function "downgrade" which in-place downgrades config from X to X-1
+      In each function, VERSION is left unchanged.
+      Each converter assumes that its input has the relevant keys
+      (i.e., the input is not a partial config).
+    3. Run the tests (test_config.py) to make sure the upgrade & downgrade
+       functions are consistent.
+"""
+import logging
+from typing import List, Optional, Tuple
+from .config import CfgNode as CN
+from .defaults import _C
+__all__ = ["upgrade_config", "downgrade_config"]
+def upgrade_config(cfg: CN, to_version: Optional[int] = None) -> CN:
+    """
+    Upgrade a config from its current version to a newer version.
+    Args:
+        cfg (CfgNode):
+        to_version (int): defaults to the latest version.
+    """
+    cfg = cfg.clone()
+    if to_version is None:
+        to_version = _C.VERSION
+    assert cfg.VERSION <= to_version, "Cannot upgrade from v{} to v{}!".format(
+        cfg.VERSION, to_version
+    )
+    for k in range(cfg.VERSION, to_version):
+        converter = globals()["ConverterV" + str(k + 1)]
+        converter.upgrade(cfg)
+        cfg.VERSION = k + 1
+    return cfg
+def downgrade_config(cfg: CN, to_version: int) -> CN:
+    """
+    Downgrade a config from its current version to an older version.
+    Args:
+        cfg (CfgNode):
+        to_version (int):
+    Note:
+        A general downgrade of arbitrary configs is not always possible due to the
+        different functionalities in different versions.
+        The purpose of downgrade is only to recover the defaults in old versions,
+        allowing it to load an old partial yaml config.
+        Therefore, the implementation only needs to fill in the default values
+        in the old version when a general downgrade is not possible.
+    """
+    cfg = cfg.clone()
+    assert cfg.VERSION >= to_version, "Cannot downgrade from v{} to v{}!".format(
+        cfg.VERSION, to_version
+    )
+    for k in range(cfg.VERSION, to_version, -1):
+        converter = globals()["ConverterV" + str(k)]
+        converter.downgrade(cfg)
+        cfg.VERSION = k - 1
+    return cfg
+def guess_version(cfg: CN, filename: str) -> int:
+    """
+    Guess the version of a partial config where the VERSION field is not specified.
+    Returns the version, or the latest if cannot make a guess.
+    This makes it easier for users to migrate.
+    """
+    logger = logging.getLogger(__name__)
+    def _has(name: str) -> bool:
+        cur = cfg
+        for n in name.split("."):
+            if n not in cur:
+                return False
+            cur = cur[n]
+        return True
+    # Most users' partial configs have "MODEL.WEIGHT", so guess on it
+    ret = None
+    if _has("MODEL.WEIGHT") or _has("TEST.AUG_ON"):
+        ret = 1
+    if ret is not None:
+        logger.warning("Config '{}' has no VERSION. Assuming it to be v{}.".format(filename, ret))
+    else:
+        ret = _C.VERSION
+        logger.warning(
+            "Config '{}' has no VERSION. Assuming it to be compatible with latest v{}.".format(
+                filename, ret
+            )
+        )
+    return ret
+def _rename(cfg: CN, old: str, new: str) -> None:
+    old_keys = old.split(".")
+    new_keys = new.split(".")
+    def _set(key_seq: List[str], val: str) -> None:
+        cur = cfg
+        for k in key_seq[:-1]:
+            if k not in cur:
+                cur[k] = CN()
+            cur = cur[k]
+        cur[key_seq[-1]] = val
+    def _get(key_seq: List[str]) -> CN:
+        cur = cfg
+        for k in key_seq:
+            cur = cur[k]
+        return cur
+    def _del(key_seq: List[str]) -> None:
+        cur = cfg
+        for k in key_seq[:-1]:
+            cur = cur[k]
+        del cur[key_seq[-1]]
+        if len(cur) == 0 and len(key_seq) > 1:
+            _del(key_seq[:-1])
+    _set(new_keys, _get(old_keys))
+    _del(old_keys)
+class _RenameConverter:
+    """
+    A converter that handles simple rename.
+    """
+    RENAME: List[Tuple[str, str]] = []  # list of tuples of (old name, new name)
+    @classmethod
+    def upgrade(cls, cfg: CN) -> None:
+        for old, new in cls.RENAME:
+            _rename(cfg, old, new)
+    @classmethod
+    def downgrade(cls, cfg: CN) -> None:
+        for old, new in cls.RENAME[::-1]:
+            _rename(cfg, new, old)
+class ConverterV1(_RenameConverter):
+    """
+    Converter for version 1; it consists of a single key rename.
+    """
+    RENAME = [("MODEL.RPN_HEAD.NAME", "MODEL.RPN.HEAD_NAME")]
+class ConverterV2(_RenameConverter):
+    """
+    A large bulk of rename, before public release.
+    Besides the plain renames listed in ``RENAME``, the anchor settings are moved
+    between ``MODEL.RPN`` / ``MODEL.RETINANET`` and ``MODEL.ANCHOR_GENERATOR``.
+    """
+    # (old name, new name); applied in this order on upgrade, reversed on downgrade
+    RENAME = [
+        ("MODEL.WEIGHT", "MODEL.WEIGHTS"),
+        ("MODEL.PANOPTIC_FPN.SEMANTIC_LOSS_SCALE", "MODEL.SEM_SEG_HEAD.LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.RPN_LOSS_SCALE", "MODEL.RPN.LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.INSTANCE_LOSS_SCALE", "MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.COMBINE_ON", "MODEL.PANOPTIC_FPN.COMBINE.ENABLED"),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_OVERLAP_THRESHOLD",
+            "MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH",
+        ),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_STUFF_AREA_LIMIT",
+            "MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT",
+        ),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_INSTANCES_CONFIDENCE_THRESHOLD",
+            "MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH",
+        ),
+        ("MODEL.ROI_HEADS.SCORE_THRESH", "MODEL.ROI_HEADS.SCORE_THRESH_TEST"),
+        ("MODEL.ROI_HEADS.NMS", "MODEL.ROI_HEADS.NMS_THRESH_TEST"),
+        ("MODEL.RETINANET.INFERENCE_SCORE_THRESHOLD", "MODEL.RETINANET.SCORE_THRESH_TEST"),
+        ("MODEL.RETINANET.INFERENCE_TOPK_CANDIDATES", "MODEL.RETINANET.TOPK_CANDIDATES_TEST"),
+        ("MODEL.RETINANET.INFERENCE_NMS_THRESHOLD", "MODEL.RETINANET.NMS_THRESH_TEST"),
+        ("TEST.DETECTIONS_PER_IMG", "TEST.DETECTIONS_PER_IMAGE"),
+        ("TEST.AUG_ON", "TEST.AUG.ENABLED"),
+        ("TEST.AUG_MIN_SIZES", "TEST.AUG.MIN_SIZES"),
+        ("TEST.AUG_MAX_SIZE", "TEST.AUG.MAX_SIZE"),
+        ("TEST.AUG_FLIP", "TEST.AUG.FLIP"),
+    ]
+    @classmethod
+    def upgrade(cls, cfg: CN) -> None:
+        """
+        Apply the plain renames, then move the anchor settings of the active meta
+        architecture to ``MODEL.ANCHOR_GENERATOR`` and delete the other copy.
+        """
+        super().upgrade(cfg)
+        if cfg.MODEL.META_ARCHITECTURE == "RetinaNet":
+            # RetinaNet's anchor settings win; the RPN ones are dropped
+            _rename(
+                cfg, "MODEL.RETINANET.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS"
+            )
+            _rename(cfg, "MODEL.RETINANET.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES")
+            del cfg["MODEL"]["RPN"]["ANCHOR_SIZES"]
+            del cfg["MODEL"]["RPN"]["ANCHOR_ASPECT_RATIOS"]
+        else:
+            # otherwise the RPN's anchor settings win; the RetinaNet ones are dropped
+            _rename(cfg, "MODEL.RPN.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS")
+            _rename(cfg, "MODEL.RPN.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES")
+            del cfg["MODEL"]["RETINANET"]["ANCHOR_SIZES"]
+            del cfg["MODEL"]["RETINANET"]["ANCHOR_ASPECT_RATIOS"]
+        # removed in both cases
+        del cfg["MODEL"]["RETINANET"]["ANCHOR_STRIDES"]
+    @classmethod
+    def downgrade(cls, cfg: CN) -> None:
+        """
+        Undo the plain renames, then move the anchor settings back under
+        ``MODEL.RPN`` and copy the same values to ``MODEL.RETINANET``.
+        """
+        super().downgrade(cfg)
+        _rename(cfg, "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS", "MODEL.RPN.ANCHOR_ASPECT_RATIOS")
+        _rename(cfg, "MODEL.ANCHOR_GENERATOR.SIZES", "MODEL.RPN.ANCHOR_SIZES")
+        # both architectures receive the same values after a downgrade
+        cfg.MODEL.RETINANET.ANCHOR_ASPECT_RATIOS = cfg.MODEL.RPN.ANCHOR_ASPECT_RATIOS
+        cfg.MODEL.RETINANET.ANCHOR_SIZES = cfg.MODEL.RPN.ANCHOR_SIZES
+        cfg.MODEL.RETINANET.ANCHOR_STRIDES = []  # this is not used anywhere in any version
--- a/detectron2/config/config.py
+++ b/detectron2/config/config.py
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+import functools
+import inspect
+import logging
+from fvcore.common.config import CfgNode as _CfgNode
+from detectron2.utils.file_io import PathManager
+class CfgNode(_CfgNode):
+    """
+    The same as `fvcore.common.config.CfgNode`, but different in:
+    1. Use unsafe yaml loading by default.
+       Note that this may lead to arbitrary code execution: you must not
+       load a config file from untrusted sources before manually inspecting
+       the content of the file.
+    2. Support config versioning.
+       When attempting to merge an old config, it will convert the old config automatically.
+    .. automethod:: clone
+    .. automethod:: freeze
+    .. automethod:: defrost
+    .. automethod:: is_frozen
+    .. automethod:: load_yaml_with_base
+    .. automethod:: merge_from_list
+    .. automethod:: merge_from_other_cfg
+    """
+    @classmethod
+    def _open_cfg(cls, filename):
+        return PathManager.open(filename, "r")
+    # Note that the default value of allow_unsafe is changed to True
+    def merge_from_file(self, cfg_filename: str, allow_unsafe: bool = True) -> None:
+        """
+        Load content from the given config file and merge it into self.
+        Args:
+            cfg_filename: config filename
+            allow_unsafe: allow unsafe yaml syntax
+        """
+        assert PathManager.isfile(cfg_filename), f"Config file '{cfg_filename}' does not exist!"
+        loaded_cfg = self.load_yaml_with_base(cfg_filename, allow_unsafe=allow_unsafe)
+        loaded_cfg = type(self)(loaded_cfg)
+        # defaults.py needs to import CfgNode
+        from .defaults import _C
+        latest_ver = _C.VERSION
+        assert (
+            latest_ver == self.VERSION
+        ), "CfgNode.merge_from_file is only allowed on a config object of latest version!"
+        logger = logging.getLogger(__name__)
+        loaded_ver = loaded_cfg.get("VERSION", None)
+        if loaded_ver is None:
+            from .compat import guess_version
+            loaded_ver = guess_version(loaded_cfg, cfg_filename)
+        assert loaded_ver <= self.VERSION, "Cannot merge a v{} config into a v{} config.".format(
+            loaded_ver, self.VERSION
+        )
+        if loaded_ver == self.VERSION:
+            self.merge_from_other_cfg(loaded_cfg)
+        else:
+            # compat.py needs to import CfgNode
+            from .compat import upgrade_config, downgrade_config
+            logger.warning(
+                "Loading an old v{} config file '{}' by automatically upgrading to v{}. "
+                "See docs/CHANGELOG.md for instructions to update your files.".format(
+                    loaded_ver, cfg_filename, self.VERSION
+                )
+            )
+            # To convert, first obtain a full config at an old version
+            old_self = downgrade_config(self, to_version=loaded_ver)
+            old_self.merge_from_other_cfg(loaded_cfg)
+            new_config = upgrade_config(old_self)
+            self.clear()
+            self.update(new_config)
+    def dump(self, *args, **kwargs):
+        """
+        Returns:
+            str: a yaml string representation of the config
+        """
+        # to make it show up in docs
+        return super().dump(*args, **kwargs)
+# Module-level config object, empty at import time; set_global_cfg() refills it in place.
+global_cfg = CfgNode()
+def get_cfg() -> CfgNode:
+    """
+    Get a copy of the default config.
+    Returns:
+        a detectron2 CfgNode instance.
+    """
+    from .defaults import _C
+    return _C.clone()
+def set_global_cfg(cfg: CfgNode) -> None:
+    """
+    Let the global config point to the given cfg.
+    Assume that the given "cfg" has the key "KEY", after calling
+    `set_global_cfg(cfg)`, the key can be accessed by:
+    ::
+        from detectron2.config import global_cfg
+        print(global_cfg.KEY)
+    By using a hacky global config, you can access these configs anywhere,
+    without having to pass the config object or the values deep into the code.
+    This is a hacky feature introduced for quick prototyping / research exploration.
+    """
+    global global_cfg
+    global_cfg.clear()
+    global_cfg.update(cfg)
+def configurable(init_func=None, *, from_config=None):
+    """
+    Decorate a function or a class's __init__ method so that it can be called
+    with a :class:`CfgNode` object using a :func:`from_config` function that translates
+    :class:`CfgNode` to arguments.
+    Examples:
+    ::
+        # Usage 1: Decorator on __init__:
+        class A:
+            @configurable
+            def __init__(self, a, b=2, c=3):
+                pass
+            @classmethod
+            def from_config(cls, cfg):   # 'cfg' must be the first argument
+                # Returns kwargs to be passed to __init__
+                return {"a": cfg.A, "b": cfg.B}
+        a1 = A(a=1, b=2)  # regular construction
+        a2 = A(cfg)       # construct with a cfg
+        a3 = A(cfg, b=3, c=4)  # construct with extra overwrite
+        # Usage 2: Decorator on any function. Needs an extra from_config argument:
+        @configurable(from_config=lambda cfg: {"a": cfg.A, "b": cfg.B})
+        def a_func(a, b=2, c=3):
+            pass
+        a1 = a_func(a=1, b=2)  # regular call
+        a2 = a_func(cfg)       # call with a cfg
+        a3 = a_func(cfg, b=3, c=4)  # call with extra overwrite
+    Args:
+        init_func (callable): a class's ``__init__`` method in usage 1. The
+            class must have a ``from_config`` classmethod which takes `cfg` as
+            the first argument.
+        from_config (callable): the from_config function in usage 2. It must take `cfg`
+            as its first argument.
+    Returns:
+        callable: in usage 1, the wrapped ``__init__``. In usage 2, a decorator that
+        wraps the target function; when neither argument is given, `configurable` itself.
+    """
+    # Usage 1: applied directly to a class's __init__.
+    if init_func is not None:
+        assert (
+            inspect.isfunction(init_func)
+            and from_config is None
+            and init_func.__name__ == "__init__"
+        ), "Incorrect use of @configurable. Check API documentation for examples."
+        @functools.wraps(init_func)
+        def wrapped(self, *args, **kwargs):
+            # Looked up on the instance's type at call time, so the from_config of the
+            # actual class being constructed is the one used.
+            try:
+                from_config_func = type(self).from_config
+            except AttributeError as e:
+                raise AttributeError(
+                    "Class with @configurable must have a 'from_config' classmethod."
+                ) from e
+            # Accessed through the class, only a bound method (e.g. a classmethod) passes this check.
+            if not inspect.ismethod(from_config_func):
+                raise TypeError("Class with @configurable must have a 'from_config' classmethod.")
+            if _called_with_cfg(*args, **kwargs):
+                # Called with a cfg: let from_config translate it into keyword arguments.
+                explicit_args = _get_args_from_config(from_config_func, *args, **kwargs)
+                init_func(self, **explicit_args)
+            else:
+                # Regular construction: pass the arguments through untouched.
+                init_func(self, *args, **kwargs)
+        return wrapped
+    else:
+        # Usage 2: called with keyword arguments, returns the actual decorator.
+        if from_config is None:
+            return configurable  # @configurable() is made equivalent to @configurable
+        assert inspect.isfunction(
+            from_config
+        ), "from_config argument of configurable must be a function!"
+        def wrapper(orig_func):
+            @functools.wraps(orig_func)
+            def wrapped(*args, **kwargs):
+                if _called_with_cfg(*args, **kwargs):
+                    # Called with a cfg: translate it into keyword arguments first.
+                    explicit_args = _get_args_from_config(from_config, *args, **kwargs)
+                    return orig_func(**explicit_args)
+                else:
+                    return orig_func(*args, **kwargs)
+            # Expose the translating function as an attribute of the wrapped function.
+            wrapped.from_config = from_config
+            return wrapped
+        return wrapper
+def _get_args_from_config(from_config_func, *args, **kwargs):
+    """
+    Use `from_config` to obtain explicit arguments.
+    When `from_config` accepts ``*args`` or ``**kwargs``, every argument is forwarded to it.
+    Otherwise only the keyword arguments it names are forwarded, and the remaining
+    keyword arguments are merged into its result.
+    Args:
+        from_config_func (callable): the `from_config` function or bound method.
+            Its first parameter must be named "cfg".
+        *args, **kwargs: the arguments the decorated callable was invoked with.
+    Returns:
+        dict: arguments to be used for cls.__init__
+    Raises:
+        TypeError: if the first parameter of `from_config_func` is not "cfg",
+            including when it takes no parameter at all.
+    """
+    signature = inspect.signature(from_config_func)
+    # `None` stands for "no parameter at all"; indexing an empty parameter list would
+    # raise an unhelpful IndexError instead of the TypeError below.
+    first_param_name = next(iter(signature.parameters), None)
+    if first_param_name != "cfg":
+        if inspect.isfunction(from_config_func):
+            name = from_config_func.__name__
+        else:
+            name = f"{from_config_func.__self__}.from_config"
+        raise TypeError(f"{name} must take 'cfg' as the first argument!")
+    support_var_arg = any(
+        param.kind in [param.VAR_POSITIONAL, param.VAR_KEYWORD]
+        for param in signature.parameters.values()
+    )
+    if support_var_arg:  # forward all arguments to from_config, if from_config accepts them
+        ret = from_config_func(*args, **kwargs)
+    else:
+        # forward supported arguments to from_config
+        supported_arg_names = set(signature.parameters.keys())
+        extra_kwargs = {}
+        # iterate over a snapshot of the keys because entries are popped inside the loop
+        for name in list(kwargs.keys()):
+            if name not in supported_arg_names:
+                extra_kwargs[name] = kwargs.pop(name)
+        ret = from_config_func(*args, **kwargs)
+        # forward the other arguments to __init__
+        ret.update(extra_kwargs)
+    return ret
+def _called_with_cfg(*args, **kwargs):
+    """
+    Check whether a call passes a config object, either as the first positional
+    argument or as the "cfg" keyword argument.
+    Returns:
+        bool: whether the arguments contain CfgNode and should be considered
+            forwarded to from_config.
+    """
+    from omegaconf import DictConfig
+    cfg_types = (_CfgNode, DictConfig)
+    if args and isinstance(args[0], cfg_types):
+        return True
+    # `from_config`'s first argument is forced to be "cfg", so the positional check
+    # above plus this keyword check cover all cases.
+    return isinstance(kwargs.get("cfg"), cfg_types)
--- a/detectron2/config/defaults.py
+++ b/detectron2/config/defaults.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .config import CfgNode as CN
+# NOTE: given the new config system
+# (https://detectron2.readthedocs.io/en/latest/tutorials/lazyconfigs.html),
+# we will stop adding new functionalities to default CfgNode.
+# -----------------------------------------------------------------------------
+# Convention about Training / Test specific parameters
+# -----------------------------------------------------------------------------
+# Whenever an argument can be either used for training or for testing, the
+# corresponding name will be post-fixed by a _TRAIN for a training parameter,
+# or _TEST for a test-specific parameter.
+# For example, the number of images during training will be
+# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
+# IMAGES_PER_BATCH_TEST
+# -----------------------------------------------------------------------------
+# Config definition
+# -----------------------------------------------------------------------------
+_C = CN()
+# The version number, to upgrade from old configs to new ones if any
+# changes happen. It's recommended to keep a VERSION in your config file.
+_C.VERSION = 2
+_C.MODEL = CN()
+_C.MODEL.LOAD_PROPOSALS = False
+_C.MODEL.MASK_ON = False
+_C.MODEL.KEYPOINT_ON = False
+_C.MODEL.DEVICE = "cuda"
+_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
+# Path (a file path, or URL like detectron2://.., https://..) to a checkpoint file
+# to be loaded to the model. You can find available models in the model zoo.
+_C.MODEL.WEIGHTS = ""
+# Values to be used for image normalization (BGR order, since INPUT.FORMAT defaults to BGR).
+# To train on images of different number of channels, just set different mean & std.
+# Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
+_C.MODEL.PIXEL_MEAN = [103.530, 116.280, 123.675]
+# When using pre-trained models in Detectron1 or any MSRA models,
+# std has been absorbed into its conv1 weights, so the std needs to be set 1.
+# Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
+_C.MODEL.PIXEL_STD = [1.0, 1.0, 1.0]
+# -----------------------------------------------------------------------------
+# INPUT
+# -----------------------------------------------------------------------------
+_C.INPUT = CN()
+# By default, {MIN,MAX}_SIZE options are used in transforms.ResizeShortestEdge.
+# Please refer to ResizeShortestEdge for detailed definition.
+# Size of the smallest side of the image during training
+_C.INPUT.MIN_SIZE_TRAIN = (800,)
+# Sample size of smallest side by choice or random selection from range given by
+# INPUT.MIN_SIZE_TRAIN
+_C.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice"
+# Maximum size of the side of the image during training
+_C.INPUT.MAX_SIZE_TRAIN = 1333
+# Size of the smallest side of the image during testing. Set to zero to disable resize in testing.
+_C.INPUT.MIN_SIZE_TEST = 800
+# Maximum size of the side of the image during testing
+_C.INPUT.MAX_SIZE_TEST = 1333
+# Mode for flipping images used in data augmentation during training
+# choose one of ["horizontal", "vertical", "none"]
+_C.INPUT.RANDOM_FLIP = "horizontal"
+# `True` if cropping is used for data augmentation during training
+_C.INPUT.CROP = CN({"ENABLED": False})
+# Cropping type. See documentation of `detectron2.data.transforms.RandomCrop` for explanation.
+_C.INPUT.CROP.TYPE = "relative_range"
+# Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of
+# pixels if CROP.TYPE is "absolute"
+_C.INPUT.CROP.SIZE = [0.9, 0.9]
+# Whether the model needs RGB, YUV, HSV etc.
+# Should be one of the modes defined here, as we use PIL to read the image:
+# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes
+# with BGR being the one exception. One can set image format to BGR, we will
+# internally use RGB for conversion and flip the channels over
+_C.INPUT.FORMAT = "BGR"
+# The ground truth mask format that the model will use.
+# Mask R-CNN supports either "polygon" or "bitmask" as ground truth.
+_C.INPUT.MASK_FORMAT = "polygon"  # alternative: "bitmask"
+# -----------------------------------------------------------------------------
+# Dataset
+# -----------------------------------------------------------------------------
+_C.DATASETS = CN()
+# List of the dataset names for training. Must be registered in DatasetCatalog
+# Samples from these datasets will be merged and used as one dataset.
+_C.DATASETS.TRAIN = ()
+# List of the pre-computed proposal files for training, which must be consistent
+# with datasets listed in DATASETS.TRAIN.
+_C.DATASETS.PROPOSAL_FILES_TRAIN = ()
+# Number of top scoring precomputed proposals to keep for training
+_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN = 2000
+# List of the dataset names for testing. Must be registered in DatasetCatalog
+_C.DATASETS.TEST = ()
+# List of the pre-computed proposal files for test, which must be consistent
+# with datasets listed in DATASETS.TEST.
+_C.DATASETS.PROPOSAL_FILES_TEST = ()
+# Number of top scoring precomputed proposals to keep for test
+_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST = 1000
+# -----------------------------------------------------------------------------
+# DataLoader
+# -----------------------------------------------------------------------------
+_C.DATALOADER = CN()
+# Number of data loading threads
+_C.DATALOADER.NUM_WORKERS = 4
+# If True, each batch should contain only images for which the aspect ratio
+# is compatible. This groups portrait images together, and landscape images
+# are not batched with portrait images.
+_C.DATALOADER.ASPECT_RATIO_GROUPING = True
+# Options: TrainingSampler, RepeatFactorTrainingSampler
+_C.DATALOADER.SAMPLER_TRAIN = "TrainingSampler"
+# Repeat threshold for RepeatFactorTrainingSampler
+_C.DATALOADER.REPEAT_THRESHOLD = 0.0
+# If True, when working on datasets that have instance annotations, the
+# training dataloader will filter out images without associated annotations
+_C.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True
+# ---------------------------------------------------------------------------- #
+# Backbone options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.BACKBONE = CN()
+_C.MODEL.BACKBONE.NAME = "build_resnet_backbone"
+# Freeze the first several stages so they are not trained.
+# There are 5 stages in ResNet. The first is a convolution, and the following
+# stages are each group of residual blocks.
+_C.MODEL.BACKBONE.FREEZE_AT = 2
+# ---------------------------------------------------------------------------- #
+# FPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.FPN = CN()
+# Names of the input feature maps to be used by FPN
+# They must have contiguous power of 2 strides
+# e.g., ["res2", "res3", "res4", "res5"]
+_C.MODEL.FPN.IN_FEATURES = []
+_C.MODEL.FPN.OUT_CHANNELS = 256
+# Options: "" (no norm), "GN"
+_C.MODEL.FPN.NORM = ""
+# Types for fusing the FPN top-down and lateral features. Can be either "sum" or "avg"
+_C.MODEL.FPN.FUSE_TYPE = "sum"
+# ---------------------------------------------------------------------------- #
+# Proposal generator options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.PROPOSAL_GENERATOR = CN()
+# Current proposal generators include "RPN", "RRPN" and "PrecomputedProposals"
+_C.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
+# Proposal height and width both need to be greater than MIN_SIZE
+# (at the scale used during training or inference)
+_C.MODEL.PROPOSAL_GENERATOR.MIN_SIZE = 0
+# ---------------------------------------------------------------------------- #
+# Anchor generator options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ANCHOR_GENERATOR = CN()
+# The generator can be any name in the ANCHOR_GENERATOR registry
+_C.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
+# Anchor sizes (i.e. sqrt of area) in absolute pixels w.r.t. the network input.
+# Format: list[list[float]]. SIZES[i] specifies the list of sizes to use for
+# IN_FEATURES[i]; len(SIZES) must be equal to len(IN_FEATURES) or 1.
+# When len(SIZES) == 1, SIZES[0] is used for all IN_FEATURES.
+_C.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]]
+# Anchor aspect ratios. For each area given in `SIZES`, anchors with different aspect
+# ratios are generated by an anchor generator.
+# Format: list[list[float]]. ASPECT_RATIOS[i] specifies the list of aspect ratios (H/W)
+# to use for IN_FEATURES[i]; len(ASPECT_RATIOS) == len(IN_FEATURES) must be true,
+# or len(ASPECT_RATIOS) == 1 is true and aspect ratio list ASPECT_RATIOS[0] is used
+# for all IN_FEATURES.
+_C.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]]
+# Anchor angles.
+# list[list[float]], the angle in degrees, for each input feature map.
+# ANGLES[i] specifies the list of angles for IN_FEATURES[i].
+_C.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]]
+# Relative offset between the center of the first anchor and the top-left corner of the image
+# Value has to be in [0, 1). Recommend to use 0.5, which means half stride.
+# The value is not expected to affect model accuracy.
+_C.MODEL.ANCHOR_GENERATOR.OFFSET = 0.0
+# ---------------------------------------------------------------------------- #
+# RPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RPN = CN()
+_C.MODEL.RPN.HEAD_NAME = "StandardRPNHead"  # used by RPN_HEAD_REGISTRY
+# Names of the input feature maps to be used by RPN
+# e.g., ["p2", "p3", "p4", "p5", "p6"] for FPN
+_C.MODEL.RPN.IN_FEATURES = ["res4"]
+# Remove RPN anchors that go outside the image by BOUNDARY_THRESH pixels
+# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
+_C.MODEL.RPN.BOUNDARY_THRESH = -1
+# IOU overlap ratios [BG_IOU_THRESHOLD, FG_IOU_THRESHOLD]
+# Minimum overlap required between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
+# ==> positive RPN example: 1)
+# Maximum overlap allowed between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a negative example (IoU < BG_IOU_THRESHOLD
+# ==> negative RPN example: 0)
+# Anchors with overlap in between (BG_IOU_THRESHOLD <= IoU < FG_IOU_THRESHOLD)
+# are ignored (-1)
+_C.MODEL.RPN.IOU_THRESHOLDS = [0.3, 0.7]
+_C.MODEL.RPN.IOU_LABELS = [0, -1, 1]
+# Number of regions per image used to train RPN
+_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
+# Target fraction of foreground (positive) examples per RPN minibatch
+_C.MODEL.RPN.POSITIVE_FRACTION = 0.5
+# Options are: "smooth_l1", "giou", "diou", "ciou"
+_C.MODEL.RPN.BBOX_REG_LOSS_TYPE = "smooth_l1"
+_C.MODEL.RPN.BBOX_REG_LOSS_WEIGHT = 1.0
+# Weights on (dx, dy, dw, dh) for normalizing RPN anchor regression targets
+_C.MODEL.RPN.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
+_C.MODEL.RPN.SMOOTH_L1_BETA = 0.0
+_C.MODEL.RPN.LOSS_WEIGHT = 1.0
+# Number of top scoring RPN proposals to keep before applying NMS
+# When FPN is used, this is *per FPN level* (not total)
+_C.MODEL.RPN.PRE_NMS_TOPK_TRAIN = 12000
+_C.MODEL.RPN.PRE_NMS_TOPK_TEST = 6000
+# Number of top scoring RPN proposals to keep after applying NMS
+# When FPN is used, this limit is applied per level and then again to the union
+# of proposals from all levels
+# NOTE: When FPN is used, the meaning of this config is different from Detectron1.
+# It means per-batch topk in Detectron1, but per-image topk here.
+# See the "find_top_rpn_proposals" function for details.
+_C.MODEL.RPN.POST_NMS_TOPK_TRAIN = 2000
+_C.MODEL.RPN.POST_NMS_TOPK_TEST = 1000
+# NMS threshold used on RPN proposals
+_C.MODEL.RPN.NMS_THRESH = 0.7
+# Set this to -1 to use the same number of output channels as input channels.
+_C.MODEL.RPN.CONV_DIMS = [-1]
+# ---------------------------------------------------------------------------- #
+# ROI HEADS options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_HEADS = CN()
+_C.MODEL.ROI_HEADS.NAME = "Res5ROIHeads"
+# Number of foreground classes
+_C.MODEL.ROI_HEADS.NUM_CLASSES = 80
+# Names of the input feature maps to be used by ROI heads
+# Currently all heads (box, mask, ...) use the same input feature map list
+# e.g., ["p2", "p3", "p4", "p5"] is commonly used for FPN
+_C.MODEL.ROI_HEADS.IN_FEATURES = ["res4"]
+# IOU overlap ratios [IOU_THRESHOLD]
+# Overlap threshold for an RoI to be considered background (if < IOU_THRESHOLD)
+# Overlap threshold for an RoI to be considered foreground (if >= IOU_THRESHOLD)
+_C.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.5]
+_C.MODEL.ROI_HEADS.IOU_LABELS = [0, 1]
+# RoI minibatch size *per image* (number of regions of interest [ROIs]) during training
+# Total number of RoIs per training minibatch =
+#   ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH
+# E.g., a common configuration is: 512 * 16 = 8192
+_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
+# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
+_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
+# Only used on test mode
+# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
+# balance obtaining high recall with not having too many low precision
+# detections that will slow down inference post processing steps (like NMS)
+# A default threshold of 0.0 increases AP by ~0.2-0.3 but significantly slows down
+# inference.
+_C.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.05
+# Overlap threshold used for non-maximum suppression (suppress boxes with
+# IoU >= this threshold)
+_C.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.5
+# If True, augment proposals with ground-truth boxes before sampling proposals to
+# train ROI heads.
+_C.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT = True
+# ---------------------------------------------------------------------------- #
+# Box Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_BOX_HEAD = CN()
+# C4 models don't use the head name option
+# Options for non-C4 models: FastRCNNConvFCHead,
+_C.MODEL.ROI_BOX_HEAD.NAME = ""
+# Options are: "smooth_l1", "giou", "diou", "ciou"
+_C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE = "smooth_l1"
+# The final scaling coefficient on the box regression loss, used to balance the magnitude of its
+# gradients with other losses in the model. See also `MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT`.
+_C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT = 1.0
+# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
+# These are empirically chosen to approximately lead to unit variance targets
+_C.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0)
+# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
+_C.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA = 0.0
+_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2"
+_C.MODEL.ROI_BOX_HEAD.NUM_FC = 0
+# Hidden layer dimension for FC layers in the RoI box head
+_C.MODEL.ROI_BOX_HEAD.FC_DIM = 1024
+_C.MODEL.ROI_BOX_HEAD.NUM_CONV = 0
+# Channel dimension for Conv layers in the RoI box head
+_C.MODEL.ROI_BOX_HEAD.CONV_DIM = 256
+# Normalization method for the convolution layers.
+# Options: "" (no norm), "GN", "SyncBN".
+_C.MODEL.ROI_BOX_HEAD.NORM = ""
+# Whether to use class agnostic for bbox regression
+_C.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG = False
+# If true, RoI heads use bounding boxes predicted by the box head rather than proposal boxes.
+_C.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES = False
+# ---------------------------------------------------------------------------- #
+# Cascaded Box Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_BOX_CASCADE_HEAD = CN()
+# The number of cascade stages is implicitly defined by the length of the following two configs.
+_C.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = (
+    (10.0, 10.0, 5.0, 5.0),
+    (20.0, 20.0, 10.0, 10.0),
+    (30.0, 30.0, 15.0, 15.0),
+)
+_C.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.6, 0.7)
+# ---------------------------------------------------------------------------- #
+# Mask Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_MASK_HEAD = CN()
+_C.MODEL.ROI_MASK_HEAD.NAME = "MaskRCNNConvUpsampleHead"
+_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_MASK_HEAD.NUM_CONV = 0  # The number of convs in the mask head
+_C.MODEL.ROI_MASK_HEAD.CONV_DIM = 256
+# Normalization method for the convolution layers.
+# Options: "" (no norm), "GN", "SyncBN".
+_C.MODEL.ROI_MASK_HEAD.NORM = ""
+# Whether to use class agnostic for mask prediction
+_C.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK = False
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_MASK_HEAD.POOLER_TYPE = "ROIAlignV2"
+# ---------------------------------------------------------------------------- #
+# Keypoint Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_KEYPOINT_HEAD = CN()
+_C.MODEL.ROI_KEYPOINT_HEAD.NAME = "KRCNNConvDeconvUpsampleHead"
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS = tuple(512 for _ in range(8))
+_C.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 17  # 17 is the number of keypoints in COCO.
+# Images with too few (or no) keypoints are excluded from training.
+_C.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE = 1
+# Normalize by the total number of visible keypoints in the minibatch if True.
+# Otherwise, normalize by the total number of keypoints that could ever exist
+# in the minibatch.
+# The keypoint softmax loss is only calculated on visible keypoints.
+# Since the number of visible keypoints can vary significantly between
+# minibatches, this has the effect of up-weighting the importance of
+# minibatches with few visible keypoints. (Imagine the extreme case of
+# only one visible keypoint versus N: in the case of N, each one
+# contributes 1/N to the gradient compared to the single keypoint
+# determining the gradient direction). Instead, we can normalize the
+# loss by the total number of keypoints, if it were the case that all
+# keypoints were visible in a full minibatch. (Returning to the example,
+# this means that the one visible keypoint contributes as much as each
+# of the N keypoints.)
+_C.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS = True
+# Multi-task loss weight to use for keypoints
+# Recommended values:
+#   - use 1.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is True
+#   - use 4.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is False
+_C.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT = 1.0
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE = "ROIAlignV2"
+# ---------------------------------------------------------------------------- #
+# Semantic Segmentation Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.SEM_SEG_HEAD = CN()
+_C.MODEL.SEM_SEG_HEAD.NAME = "SemSegFPNHead"
+_C.MODEL.SEM_SEG_HEAD.IN_FEATURES = ["p2", "p3", "p4", "p5"]
+# Label in the semantic segmentation ground truth that is ignored, i.e., no loss is calculated for
+# the corresponding pixel.
+_C.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255
+# Number of classes in the semantic segmentation head
+_C.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 54
+# Number of channels in the 3x3 convs inside semantic-FPN heads.
+_C.MODEL.SEM_SEG_HEAD.CONVS_DIM = 128
+# Outputs from semantic-FPN heads are up-scaled to the COMMON_STRIDE stride.
+_C.MODEL.SEM_SEG_HEAD.COMMON_STRIDE = 4
+# Normalization method for the convolution layers. Options: "" (no norm), "GN".
+_C.MODEL.SEM_SEG_HEAD.NORM = "GN"
+_C.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT = 1.0
+_C.MODEL.PANOPTIC_FPN = CN()
+# Scaling of all losses from instance detection / segmentation head.
+_C.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT = 1.0
+# options when combining instance & semantic segmentation outputs
+_C.MODEL.PANOPTIC_FPN.COMBINE = CN({"ENABLED": True})  # "COMBINE.ENABLED" is deprecated & not used
+_C.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH = 0.5
+_C.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT = 4096
+_C.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = 0.5
+# ---------------------------------------------------------------------------- #
+# RetinaNet Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RETINANET = CN()
+# This is the number of foreground classes.
+_C.MODEL.RETINANET.NUM_CLASSES = 80
+_C.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
+# Convolutions to use in the cls and bbox tower
+# NOTE: this doesn't include the last conv for logits
+_C.MODEL.RETINANET.NUM_CONVS = 4
+# IoU overlap ratio [bg, fg] for labeling anchors.
+# Anchors with < bg are labeled negative (0)
+# Anchors with >= bg and < fg are ignored (-1)
+# Anchors with >= fg are labeled positive (1)
+_C.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5]
+_C.MODEL.RETINANET.IOU_LABELS = [0, -1, 1]
+# Prior prob for rare case (i.e. foreground) at the beginning of training.
+# This is used to set the bias for the logits layer of the classifier subnet.
+# This improves training stability in the case of heavy class imbalance.
+_C.MODEL.RETINANET.PRIOR_PROB = 0.01
+# Inference cls score threshold, only anchors with score > SCORE_THRESH_TEST are
+# considered for inference (to improve speed)
+_C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
+# Select topk candidates before NMS
+_C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
+_C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5
+# Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets
+_C.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+# Loss parameters
+_C.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0
+_C.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25
+_C.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1
+# Options are: "smooth_l1", "giou", "diou", "ciou"
+_C.MODEL.RETINANET.BBOX_REG_LOSS_TYPE = "smooth_l1"
+# One of BN, SyncBN, FrozenBN, GN
+# Only supports GN until unshared norm is implemented
+_C.MODEL.RETINANET.NORM = ""
+# ---------------------------------------------------------------------------- #
+# ResNe[X]t options (ResNets = {ResNet, ResNeXt})
+# Note that parts of a resnet may be used for both the backbone and the head
+# These options apply to both
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RESNETS = CN()
+_C.MODEL.RESNETS.DEPTH = 50
+_C.MODEL.RESNETS.OUT_FEATURES = ["res4"]  # res4 for C4 backbone, res2..5 for FPN backbone
+# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
+_C.MODEL.RESNETS.NUM_GROUPS = 1
+# Options: FrozenBN, GN, "SyncBN", "BN"
+_C.MODEL.RESNETS.NORM = "FrozenBN"
+# Baseline width of each group.
+# Scaling this parameter will scale the width of all bottleneck layers.
+_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
+# Place the stride 2 conv on the 1x1 filter
+# Use True only for the original MSRA ResNet; use False for C2 and Torch models
+_C.MODEL.RESNETS.STRIDE_IN_1X1 = True
+# Apply dilation in stage "res5"
+_C.MODEL.RESNETS.RES5_DILATION = 1
+# Output width of res2. Scaling this parameter will scale the width of all 1x1 convs in ResNet
+# For R18 and R34, this needs to be set to 64
+_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
+_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
+# Apply Deformable Convolution in stages
+# Specify if apply deform_conv on Res2, Res3, Res4, Res5
+_C.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False]
+# Use True to use modulated deform_conv (DeformableV2, https://arxiv.org/abs/1811.11168);
+# Use False for DeformableV1.
+_C.MODEL.RESNETS.DEFORM_MODULATED = False
+# Number of groups in deformable conv.
+_C.MODEL.RESNETS.DEFORM_NUM_GROUPS = 1
+# ---------------------------------------------------------------------------- #
+# Solver
+# ---------------------------------------------------------------------------- #
+_C.SOLVER = CN()
+# Options: WarmupMultiStepLR, WarmupCosineLR.
+# See detectron2/solver/build.py for definition.
+_C.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR"
+_C.SOLVER.MAX_ITER = 40000
+_C.SOLVER.BASE_LR = 0.001
+_C.SOLVER.MOMENTUM = 0.9
+_C.SOLVER.NESTEROV = False
+_C.SOLVER.WEIGHT_DECAY = 0.0001
+# The weight decay that's applied to parameters of normalization layers
+# (typically the affine transformation)
+_C.SOLVER.WEIGHT_DECAY_NORM = 0.0
+_C.SOLVER.GAMMA = 0.1
+# The iteration number to decrease learning rate by GAMMA.
+_C.SOLVER.STEPS = (30000,)
+_C.SOLVER.WARMUP_FACTOR = 1.0 / 1000
+_C.SOLVER.WARMUP_ITERS = 1000
+_C.SOLVER.WARMUP_METHOD = "linear"
+# Save a checkpoint after every this number of iterations
+_C.SOLVER.CHECKPOINT_PERIOD = 5000
+# Number of images per batch across all machines. This is also the number
+# of training images per step (i.e. per iteration). If we use 16 GPUs
+# and IMS_PER_BATCH = 32, each GPU will see 2 images per batch.
+# May be adjusted automatically if REFERENCE_WORLD_SIZE is set.
+_C.SOLVER.IMS_PER_BATCH = 16
+# The reference number of workers (GPUs) this config is meant to train with.
+# It takes no effect when set to 0.
+# With a non-zero value, it will be used by DefaultTrainer to compute a desired
+# per-worker batch size, and then scale the other related configs (total batch size,
+# learning rate, etc) to match the per-worker batch size.
+# See documentation of `DefaultTrainer.auto_scale_workers` for details:
+_C.SOLVER.REFERENCE_WORLD_SIZE = 0
+# Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for
+# biases. This is not useful (at least for recent models). You should avoid
+# changing these and they exist only to reproduce Detectron v1 training if
+# desired.
+_C.SOLVER.BIAS_LR_FACTOR = 1.0
+_C.SOLVER.WEIGHT_DECAY_BIAS = None  # None means following WEIGHT_DECAY
+# Gradient clipping
+_C.SOLVER.CLIP_GRADIENTS = CN({"ENABLED": False})
+# Type of gradient clipping, currently 2 values are supported:
+# - "value": the absolute values of elements of each gradients are clipped
+# - "norm": the norm of the gradient for each parameter is clipped thus
+#   affecting all elements in the parameter
+_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "value"
+# Maximum absolute value used for clipping gradients
+_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0
+# Floating point number p for L-p norm to be used with the "norm"
+# gradient clipping type; for L-inf, please specify .inf
+_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
+# Enable automatic mixed precision for training
+# Note that this does not change model's inference behavior.
+# To use AMP in inference, run inference under autocast()
+_C.SOLVER.AMP = CN({"ENABLED": False})
+# ---------------------------------------------------------------------------- #
+# Specific test options
+# ---------------------------------------------------------------------------- #
+_C.TEST = CN()
+# For end-to-end tests to verify the expected accuracy.
+# Each item is [task, metric, value, tolerance]
+# e.g.: [['bbox', 'AP', 38.5, 0.2]]
+_C.TEST.EXPECTED_RESULTS = []
+# The period (in terms of steps) to evaluate the model during training.
+# Set to 0 to disable.
+_C.TEST.EVAL_PERIOD = 0
+# The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval
+# When empty, the COCO defaults are used.
+# Otherwise it should be a list[float] with the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
+_C.TEST.KEYPOINT_OKS_SIGMAS = []
+# Maximum number of detections to return per image during inference (100 is
+# based on the limit established for the COCO dataset).
+_C.TEST.DETECTIONS_PER_IMAGE = 100
+_C.TEST.AUG = CN({"ENABLED": False})  # NOTE(review): presumably test-time augmentation; off by default - confirm against the consumer
+_C.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)  # presumably the image sizes tried during augmentation - confirm
+_C.TEST.AUG.MAX_SIZE = 4000  # presumably an upper bound on the resized image size - confirm
+_C.TEST.AUG.FLIP = True  # presumably also evaluates flipped inputs - confirm
+_C.TEST.PRECISE_BN = CN({"ENABLED": False})  # NOTE(review): presumably recomputes BatchNorm statistics before evaluation; off by default - confirm
+_C.TEST.PRECISE_BN.NUM_ITER = 200  # presumably the number of iterations used for that recomputation - confirm
+# ---------------------------------------------------------------------------- #
+# Misc options
+# ---------------------------------------------------------------------------- #
+# Directory where output files are written
+_C.OUTPUT_DIR = "./output"
+# Set seed to negative to fully randomize everything.
+# Set seed to positive to use a fixed seed. Note that a fixed seed increases
+# reproducibility but does not guarantee fully deterministic behavior.
+# Disabling all parallelism further increases reproducibility.
+_C.SEED = -1
+# Benchmark different cudnn algorithms.
+# If input images have very different sizes, this option will have large overhead
+# for about 10k iterations. It usually hurts total time, but can benefit certain models.
+# If input images have the same or similar sizes, benchmarking is often helpful.
+_C.CUDNN_BENCHMARK = False
+# The period (in terms of steps) for minibatch visualization at train time.
+# Set to 0 to disable.
+_C.VIS_PERIOD = 0
+# The GLOBAL config node is for quick-hack purposes.
+# You can set its values on the command line or in config files,
+# and access them with:
+#
+# from detectron2.config import global_cfg
+# print(global_cfg.HACK)
+#
+# Do not commit any configs into it.
+_C.GLOBAL = CN()
+_C.GLOBAL.HACK = 1.0
--- a/detectron2/config/instantiate.py
+++ b/detectron2/config/instantiate.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import dataclasses
+import logging
+from collections import abc
+from typing import Any
+from detectron2.utils.registry import _convert_target_to_string, locate
+__all__ = ["dump_dataclass", "instantiate"]
+def dump_dataclass(obj: Any):
+    """
+    Recursively convert a dataclass instance into a plain dict that
+    :func:`instantiate` can later turn back into an object.
+    The dict stores the dataclass type under "_target_" (as a string) and one
+    entry per dataclass field. Nested dataclass instances, including those found
+    directly inside list/tuple fields, are dumped the same way; list/tuple
+    fields are always emitted as lists.
+    Args:
+        obj: a dataclass object (an instance, not the dataclass type itself)
+    Returns:
+        dict
+    """
+    assert not isinstance(obj, type) and dataclasses.is_dataclass(
+        obj
+    ), "dump_dataclass() requires an instance of a dataclass."
+    dumped = {"_target_": _convert_target_to_string(type(obj))}
+    for field in dataclasses.fields(obj):
+        value = getattr(obj, field.name)
+        if dataclasses.is_dataclass(value):
+            value = dump_dataclass(value)
+        elif isinstance(value, (list, tuple)):
+            # Only direct elements are inspected; deeper nesting is kept as-is.
+            value = [
+                dump_dataclass(item) if dataclasses.is_dataclass(item) else item for item in value
+            ]
+        dumped[field.name] = value
+    return dumped
+def instantiate(cfg):
+    """
+    Recursively build the objects described by "_target_" dictionaries.
+    Lists (plain or omegaconf ``ListConfig``) are instantiated element-wise.
+    A mapping containing "_target_" has all of its values instantiated first and
+    is then turned into a call ``target(**remaining_items)``. Anything else is
+    returned unchanged.
+    Args:
+        cfg: a dict-like object with "_target_" that defines the caller, and
+            other keys that define the arguments
+    Returns:
+        object instantiated by cfg
+    """
+    from omegaconf import ListConfig
+    if isinstance(cfg, ListConfig):
+        built = [instantiate(item) for item in cfg]
+        return ListConfig(built, flags={"allow_objects": True})
+    if isinstance(cfg, list):
+        # Plain lists get special treatment because many classes take
+        # list[objects] as arguments, such as ResNet, DatasetMapper
+        return [instantiate(item) for item in cfg]
+    if not (isinstance(cfg, abc.Mapping) and "_target_" in cfg):
+        return cfg  # nothing to instantiate: hand the value back untouched
+    # conceptually equivalent to hydra.utils.instantiate(cfg) with _convert_=all,
+    # but faster: https://github.com/facebookresearch/hydra/issues/1200
+    kwargs = {key: instantiate(value) for key, value in cfg.items()}
+    cls = instantiate(kwargs.pop("_target_"))
+    if isinstance(cls, str):
+        # A dotted-path string has to be resolved to the actual object.
+        cls_name = cls
+        cls = locate(cls_name)
+        assert cls is not None, cls_name
+    else:
+        try:
+            cls_name = cls.__module__ + "." + cls.__qualname__
+        except Exception:
+            # target could be anything, so building a qualified name may fail
+            cls_name = str(cls)
+    assert callable(cls), f"_target_ {cls} does not define a callable object"
+    try:
+        return cls(**kwargs)
+    except TypeError:
+        # Report which target rejected its arguments, then let the error propagate.
+        logging.getLogger(__name__).error(f"Error when instantiating {cls_name}!")
+        raise
--- a/detectron2/config/lazy.py
+++ b/detectron2/config/lazy.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import ast
+import builtins
+import importlib
+import inspect
+import logging
+import os
+import uuid
+from collections import abc
+from contextlib import contextmanager
+from copy import deepcopy
+from dataclasses import is_dataclass
+from typing import List, Tuple, Union
+import cloudpickle
+import yaml
+from omegaconf import DictConfig, ListConfig, OmegaConf
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.registry import _convert_target_to_string
+__all__ = ["LazyCall", "LazyConfig"]
+class LazyCall:
+    """
+    Wrapper around a callable that records a call instead of performing it:
+    calling the wrapper returns a config dict describing the call.
+    Only keyword arguments are accepted when calling a LazyCall object;
+    positional arguments are not yet supported.
+    Examples:
+    ::
+        from detectron2.config import instantiate, LazyCall
+        layer_cfg = LazyCall(nn.Conv2d)(in_channels=32, out_channels=32)
+        layer_cfg.out_channels = 64   # can edit it afterwards
+        layer = instantiate(layer_cfg)
+    """
+    def __init__(self, target):
+        # A target is either callable itself, or a str / mapping that stands for one.
+        describes_callable = callable(target) or isinstance(target, (str, abc.Mapping))
+        if not describes_callable:
+            raise TypeError(
+                f"target of LazyCall must be a callable or defines a callable! Got {target}"
+            )
+        self._target = target
+    def __call__(self, **kwargs):
+        # omegaconf object cannot hold dataclass type, so store its name instead
+        # https://github.com/omry/omegaconf/issues/784
+        target = (
+            _convert_target_to_string(self._target)
+            if is_dataclass(self._target)
+            else self._target
+        )
+        kwargs["_target_"] = target
+        return DictConfig(content=kwargs, flags={"allow_objects": True})
+def _visit_dict_config(cfg, func):
+    """
+    Walk cfg depth-first and call func on every DictConfig found in it
+    (a DictConfig is visited before its own values).
+    """
+    if isinstance(cfg, DictConfig):
+        func(cfg)
+        children = cfg.values()
+    elif isinstance(cfg, ListConfig):
+        children = cfg
+    else:
+        # Leaves (neither DictConfig nor ListConfig) are ignored.
+        return
+    for child in children:
+        _visit_dict_config(child, func)
+def _validate_py_syntax(filename):
+    """
+    Parse the file with ``ast`` and raise a SyntaxError naming the config file
+    if it is not valid Python.
+    """
+    # see also https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py
+    with PathManager.open(filename, "r") as source_file:
+        source = source_file.read()
+    try:
+        ast.parse(source)
+    except SyntaxError as err:
+        raise SyntaxError(f"Config file {filename} has syntax error!") from err
+def _cast_to_config(obj):
+    """
+    Wrap a plain dict into a DictConfig (objects allowed); return anything else unchanged.
+    """
+    if not isinstance(obj, dict):
+        return obj
+    return DictConfig(obj, flags={"allow_objects": True})
+_CFG_PACKAGE_NAME = "detectron2._cfg_loader"
+"""
+A namespace to put all imported config into.
+"""
+def _random_package_name(filename):
+    """
+    Build a package name for a config file being loaded: the shared prefix,
+    four random hex characters, a dot, and the file's base name.
+    """
+    random_tag = uuid.uuid4().hex[:4]
+    return f"{_CFG_PACKAGE_NAME}{random_tag}.{os.path.basename(filename)}"
+@contextmanager
+def _patch_import():
+    """
+    Temporarily replace ``builtins.__import__`` so that relative import
+    statements inside config files:
+    1. locate files purely based on relative location, regardless of packages.
+       e.g. you can import file without having __init__
+    2. do not cache modules globally; modifications of module states has no side effect
+    3. support other storage system through PathManager
+    4. imported dict are turned into omegaconf.DictConfig automatically
+    The original ``__import__`` is always restored when the ``with`` block exits,
+    including when the body raises (e.g. a config file that fails while executing).
+    Yields:
+        the patched import function
+    """
+    # `import importlib` alone does not guarantee these submodules are loaded.
+    import importlib.machinery
+    import importlib.util
+    old_import = builtins.__import__
+    def find_relative_file(original_file, relative_import_path, level):
+        # Each extra leading dot of the relative import climbs one directory up.
+        cur_file = os.path.dirname(original_file)
+        for _ in range(level - 1):
+            cur_file = os.path.dirname(cur_file)
+        cur_name = relative_import_path.lstrip(".")
+        for part in cur_name.split("."):
+            cur_file = os.path.join(cur_file, part)
+        # NOTE: directory import is not handled. Because then it's unclear
+        # if such import should produce python module or DictConfig. This can
+        # be discussed further if needed.
+        if not cur_file.endswith(".py"):
+            cur_file += ".py"
+        if not PathManager.isfile(cur_file):
+            raise ImportError(
+                f"Cannot import name {relative_import_path} from "
+                f"{original_file}: {cur_file} has to exist."
+            )
+        return cur_file
+    def new_import(name, globals=None, locals=None, fromlist=(), level=0):
+        if (
+            # Only deal with relative imports inside config files
+            level != 0
+            and globals is not None
+            and (globals.get("__package__", "") or "").startswith(_CFG_PACKAGE_NAME)
+        ):
+            cur_file = find_relative_file(globals["__file__"], name, level)
+            _validate_py_syntax(cur_file)
+            spec = importlib.machinery.ModuleSpec(
+                _random_package_name(cur_file), None, origin=cur_file
+            )
+            module = importlib.util.module_from_spec(spec)
+            module.__file__ = cur_file
+            with PathManager.open(cur_file) as f:
+                content = f.read()
+            # Execute the file in a fresh module that is never put into sys.modules.
+            exec(compile(content, cur_file, "exec"), module.__dict__)
+            for attr in fromlist:  # turn imported dict into DictConfig automatically
+                val = _cast_to_config(module.__dict__[attr])
+                module.__dict__[attr] = val
+            return module
+        return old_import(name, globals, locals, fromlist=fromlist, level=level)
+    builtins.__import__ = new_import
+    try:
+        yield new_import
+    finally:
+        # Without this, an exception in the `with` body would leave the patched
+        # importer installed for the rest of the process.
+        builtins.__import__ = old_import
+class LazyConfig:
+    """
+    Provide methods to save, load, and override an omegaconf config object
+    which may contain definition of lazily-constructed objects.
+    """
+    @staticmethod
+    def load_rel(filename: str, keys: Union[None, str, Tuple[str, ...]] = None):
+        """
+        Similar to :meth:`load()`, but load path relative to the caller's
+        source file.
+        This has the same functionality as a relative import, except that this method
+        accepts filename as a string, so more characters are allowed in the filename.
+        Args:
+            filename: path relative to the directory of the calling source file
+            keys: same as in :meth:`load()`
+        """
+        # Frame 1 is the direct caller; its code object records the source file name.
+        caller_frame = inspect.stack()[1]
+        caller_fname = caller_frame[0].f_code.co_filename
+        assert caller_fname != "<string>", "load_rel Unable to find caller"
+        caller_dir = os.path.dirname(caller_fname)
+        filename = os.path.join(caller_dir, filename)
+        return LazyConfig.load(filename, keys)
+    @staticmethod
+    def load(filename: str, keys: Union[None, str, Tuple[str, ...]] = None):
+        """
+        Load a config file.
+        Args:
+            filename: absolute path or relative path w.r.t. the current working directory
+            keys: keys to load and return. If not given, return all keys
+                (whose values are config objects) in a dict.
+        Returns:
+            the value for ``keys`` when it is a str, a tuple of values when it is a
+            sequence of keys, otherwise a config object with all loaded entries.
+        Raises:
+            ValueError: if the file extension is not .py, .yaml or .yml
+        """
+        has_keys = keys is not None
+        filename = filename.replace("/./", "/")  # redundant
+        if os.path.splitext(filename)[1] not in [".py", ".yaml", ".yml"]:
+            raise ValueError(f"Config file {filename} has to be a python or yaml file.")
+        if filename.endswith(".py"):
+            _validate_py_syntax(filename)
+            with _patch_import():
+                # Record the filename
+                module_namespace = {
+                    "__file__": filename,
+                    "__package__": _random_package_name(filename),
+                }
+                with PathManager.open(filename) as f:
+                    content = f.read()
+                # Compile first with filename to:
+                # 1. make filename appears in stacktrace
+                # 2. make load_rel able to find its parent's (possibly remote) location
+                exec(compile(content, filename, "exec"), module_namespace)
+            ret = module_namespace
+        else:
+            # NOTE: yaml.unsafe_load can construct arbitrary Python objects.
+            # Only load config files that come from a trusted source.
+            with PathManager.open(filename) as f:
+                obj = yaml.unsafe_load(f)
+            ret = OmegaConf.create(obj, flags={"allow_objects": True})
+        if has_keys:
+            if isinstance(keys, str):
+                return _cast_to_config(ret[keys])
+            else:
+                return tuple(_cast_to_config(ret[a]) for a in keys)
+        else:
+            if filename.endswith(".py"):
+                # when not specified, only load those that are config objects
+                ret = DictConfig(
+                    {
+                        name: _cast_to_config(value)
+                        for name, value in ret.items()
+                        if isinstance(value, (DictConfig, ListConfig, dict))
+                        and not name.startswith("_")
+                    },
+                    flags={"allow_objects": True},
+                )
+            return ret
+    @staticmethod
+    def save(cfg, filename: str):
+        """
+        Save a config object to a yaml file.
+        Note that when the config dictionary contains complex objects (e.g. lambda),
+        it can't be saved to yaml. In that case we will print an error and
+        attempt to save to a pkl file instead. A failure of that fallback is
+        logged as well; this method itself does not raise on serialization errors.
+        Args:
+            cfg: an omegaconf config object
+            filename: yaml file name to save the config file
+        """
+        logger = logging.getLogger(__name__)
+        try:
+            cfg = deepcopy(cfg)
+        except Exception:
+            # Best effort: a config that cannot be deep-copied is saved as-is.
+            pass
+        else:
+            # if it's deep-copyable, then...
+            def _replace_type_by_name(x):
+                if "_target_" in x and callable(x._target_):
+                    try:
+                        x._target_ = _convert_target_to_string(x._target_)
+                    except AttributeError:
+                        pass
+            # not necessary, but makes yaml looks nicer
+            _visit_dict_config(cfg, _replace_type_by_name)
+        save_pkl = False
+        try:
+            cfg_dict = OmegaConf.to_container(cfg, resolve=False)
+            dumped = yaml.dump(cfg_dict, default_flow_style=None, allow_unicode=True, width=9999)
+            with PathManager.open(filename, "w") as f:
+                f.write(dumped)
+            try:
+                _ = yaml.unsafe_load(dumped)  # test that it is loadable
+            except Exception:
+                logger.warning(
+                    "The config contains objects that cannot serialize to a valid yaml. "
+                    f"{filename} is human-readable but cannot be loaded."
+                )
+                save_pkl = True
+        except Exception:
+            logger.exception("Unable to serialize the config to yaml. Error:")
+            save_pkl = True
+        if save_pkl:
+            new_filename = filename + ".pkl"
+            try:
+                # retry by pickle
+                with PathManager.open(new_filename, "wb") as f:
+                    cloudpickle.dump(cfg, f)
+                logger.warning(f"Config is saved using cloudpickle at {new_filename}.")
+            except Exception:
+                # Still best effort, but do not hide that no loadable copy was written.
+                logger.exception(
+                    f"Unable to save the config using cloudpickle at {new_filename}. Error:"
+                )
+    @staticmethod
+    def apply_overrides(cfg, overrides: List[str]):
+        """
+        In-place override contents of cfg.
+        Args:
+            cfg: an omegaconf config object
+            overrides: list of strings in the format of "a=b" to override configs.
+                See https://hydra.cc/docs/next/advanced/override_grammar/basic/
+                for syntax.
+        Returns:
+            the cfg object
+        Raises:
+            KeyError: if a prefix of an overridden key exists but is not a config node
+            NotImplementedError: if an override requests deletion of a key
+        """
+        def safe_update(cfg, key, value):
+            # Refuse to write "through" an existing non-config value: every
+            # existing prefix of the dotted key must itself be a config node.
+            parts = key.split(".")
+            for idx in range(1, len(parts)):
+                prefix = ".".join(parts[:idx])
+                v = OmegaConf.select(cfg, prefix, default=None)
+                if v is None:
+                    break
+                if not OmegaConf.is_config(v):
+                    raise KeyError(
+                        f"Trying to update key {key}, but {prefix} "
+                        f"is not a config, but has type {type(v)}."
+                    )
+            OmegaConf.update(cfg, key, value, merge=True)
+        from hydra.core.override_parser.overrides_parser import OverridesParser
+        parser = OverridesParser.create()
+        overrides = parser.parse_overrides(overrides)
+        for o in overrides:
+            key = o.key_or_group
+            value = o.value()
+            if o.is_delete():
+                # TODO support this
+                raise NotImplementedError("deletion is not yet a supported override")
+            safe_update(cfg, key, value)
+        return cfg
+    @staticmethod
+    def to_py(cfg, prefix: str = "cfg."):
+        """
+        Try to convert a config object into Python-like pseudo code.
+        Note that perfect conversion is not always possible. So the returned
+        results are mainly meant to be human-readable, and not meant to be executed.
+        Args:
+            cfg: an omegaconf config object
+            prefix: root name for the resulting code (default: "cfg.")
+        Returns:
+            str of formatted Python code
+        """
+        import black
+        # Work on a plain container so the pop() below never touches the caller's cfg.
+        cfg = OmegaConf.to_container(cfg, resolve=True)
+        def _to_str(obj, prefix=None, inside_call=False):
+            if prefix is None:
+                prefix = []
+            if isinstance(obj, abc.Mapping) and "_target_" in obj:
+                # Dict representing a function call
+                target = _convert_target_to_string(obj.pop("_target_"))
+                args = []
+                for k, v in sorted(obj.items()):
+                    args.append(f"{k}={_to_str(v, inside_call=True)}")
+                args = ", ".join(args)
+                call = f"{target}({args})"
+                return "".join(prefix) + call
+            elif isinstance(obj, abc.Mapping) and not inside_call:
+                # Dict that is not inside a call is a list of top-level config objects that we
+                # render as one object per line with dot separated prefixes
+                key_list = []
+                for k, v in sorted(obj.items()):
+                    if isinstance(v, abc.Mapping) and "_target_" not in v:
+                        key_list.append(_to_str(v, prefix=prefix + [k + "."]))
+                    else:
+                        key = "".join(prefix) + k
+                        key_list.append(f"{key}={_to_str(v)}")
+                return "\n".join(key_list)
+            elif isinstance(obj, abc.Mapping):
+                # Dict that is inside a call is rendered as a regular dict
+                return (
+                    "{"
+                    + ",".join(
+                        f"{repr(k)}: {_to_str(v, inside_call=inside_call)}"
+                        for k, v in sorted(obj.items())
+                    )
+                    + "}"
+                )
+            elif isinstance(obj, list):
+                return "[" + ",".join(_to_str(x, inside_call=inside_call) for x in obj) + "]"
+            else:
+                return repr(obj)
+        py_str = _to_str(cfg, prefix=[prefix])
+        try:
+            return black.format_str(py_str, mode=black.Mode())
+        except black.InvalidInput:
+            # The pseudo code is not always valid Python; return it unformatted then.
+            return py_str
--- a/detectron2/data/__init__.py
+++ b/detectron2/data/__init__.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+from . import transforms  # isort:skip
+from .build import (
+    build_batch_data_loader,
+    build_detection_test_loader,
+    build_detection_train_loader,
+    get_detection_dataset_dicts,
+    load_proposals_into_dataset,
+    print_instances_class_histogram,
+)
+from .catalog import DatasetCatalog, MetadataCatalog, Metadata
+from .common import DatasetFromList, MapDataset, ToIterableDataset
+from .dataset_mapper import DatasetMapper
+# ensure the builtin datasets are registered
+from . import datasets, samplers  # isort:skip
+__all__ = [k for k in globals().keys() if not k.startswith("_")]  # export every name bound so far that has no leading underscore
--- a/detectron2/data/benchmark.py
+++ b/detectron2/data/benchmark.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import numpy as np
+from itertools import count
+from typing import List, Tuple
+import torch
+import tqdm
+from fvcore.common.timer import Timer
+from detectron2.utils import comm
+from .build import build_batch_data_loader
+from .common import DatasetFromList, MapDataset
+from .samplers import TrainingSampler
+logger = logging.getLogger(__name__)
+class _EmptyMapDataset(torch.utils.data.Dataset):
+    """
+    Dataset wrapper that still fetches every item from the wrapped dataset
+    but returns a tiny placeholder instead of the item.
+    """
+    def __init__(self, dataset):
+        self.ds = dataset
+    def __len__(self):
+        return len(self.ds)
+    def __getitem__(self, idx):
+        # Fetch the real item, then throw it away and return the placeholder.
+        self.ds[idx]
+        return [0]
+def iter_benchmark(
+    iterator, num_iter: int, warmup: int = 5, max_time_seconds: float = 60
+) -> Tuple[float, List[float]]:
+    """
+    Benchmark an iterator/iterable for `num_iter` iterations with an extra
+    `warmup` iterations of warmup.
+    End early if `max_time_seconds` time is spent on iterations.
+    Args:
+        iterator: the iterator/iterable to pull items from
+        num_iter: number of timed iterations
+        warmup: number of untimed iterations consumed before timing starts
+        max_time_seconds: time budget for the timed iterations
+    Returns:
+        float: average time (seconds) per iteration, or NaN if no timed
+            iteration was run (e.g. `num_iter` is 0)
+        list[float]: time spent on each iteration. Sometimes useful for further analysis.
+    """
+    num_iter, warmup = int(num_iter), int(warmup)
+    iterator = iter(iterator)
+    for _ in range(warmup):
+        next(iterator)
+    # The timer starts after warmup, so warmup cost is excluded from the average.
+    timer = Timer()
+    all_times = []
+    for curr_iter in tqdm.trange(num_iter):
+        start = timer.seconds()
+        if start > max_time_seconds:
+            # Out of budget: only the iterations completed so far count.
+            num_iter = curr_iter
+            break
+        next(iterator)
+        all_times.append(timer.seconds() - start)
+    if num_iter <= 0:
+        # Nothing was timed; avoid dividing by zero.
+        return float("nan"), all_times
+    avg = timer.seconds() / num_iter
+    return avg, all_times
+class DataLoaderBenchmark:
+    """
+    A collection of benchmarks that help locate the performance bottleneck of a
+    standard dataloader made of dataset, mapper and sampler.
+    """
+    def __init__(
+        self,
+        dataset,
+        *,
+        mapper,
+        sampler=None,
+        total_batch_size,
+        num_workers=0,
+        max_time_seconds: int = 90,
+    ):
+        """
+        Args:
+            max_time_seconds (int): maximum time to spent for each benchmark
+            other args: same as in `build.py:build_detection_train_loader`
+        """
+        if isinstance(dataset, list):
+            dataset = DatasetFromList(dataset, copy=False, serialize=True)
+        self.dataset = dataset
+        self.mapper = mapper
+        # Fall back to a training sampler over the whole dataset.
+        self.sampler = TrainingSampler(len(dataset)) if sampler is None else sampler
+        self.total_batch_size = total_batch_size
+        self.num_workers = num_workers
+        self.per_gpu_batch_size = self.total_batch_size // comm.get_world_size()
+        self.max_time_seconds = max_time_seconds
+    def _benchmark(self, iterator, num_iter, warmup, msg=None):
+        # Time the iterator; log the result only when a message is supplied.
+        avg, all_times = iter_benchmark(iterator, num_iter, warmup, self.max_time_seconds)
+        if msg is None:
+            return avg, all_times
+        self._log_time(msg, avg, all_times)
+        return avg, all_times
+    def _log_time(self, msg, avg, all_times, distributed=False):
+        # Log throughput and latency percentiles; in distributed mode the numbers
+        # of every worker are gathered and logged by rank 0 only.
+        percentiles = [np.percentile(all_times, q, interpolation="nearest") for q in (1, 5, 95, 99)]
+        if distributed:
+            avg_per_gpu = comm.all_gather(avg)
+            percentiles_per_gpu = comm.all_gather(percentiles)
+            if comm.get_rank() > 0:
+                return
+            for idx, (avg, percentiles) in enumerate(zip(avg_per_gpu, percentiles_per_gpu)):
+                logger.info(
+                    f"GPU{idx} {msg}: avg={1.0/avg:.1f} it/s, "
+                    f"p1={percentiles[0]:.2g}s, p5={percentiles[1]:.2g}s, "
+                    f"p95={percentiles[2]:.2g}s, p99={percentiles[3]:.2g}s."
+                )
+            return
+        logger.info(
+            f"{msg}: avg={1.0/avg:.1f} it/s, "
+            f"p1={percentiles[0]:.2g}s, p5={percentiles[1]:.2g}s, "
+            f"p95={percentiles[2]:.2g}s, p99={percentiles[3]:.2g}s."
+        )
+    def benchmark_dataset(self, num_iter, warmup=5):
+        """
+        Measure how fast raw samples can be read from the dataset.
+        """
+        def raw_samples():
+            # Cycle over the sampler forever; the benchmark decides when to stop.
+            while True:
+                for index in self.sampler:
+                    yield self.dataset[index]
+        self._benchmark(raw_samples(), num_iter, warmup, "Dataset Alone")
+    def benchmark_mapper(self, num_iter, warmup=5):
+        """
+        Measure how fast raw samples can be read from the dataset and mapped,
+        all within a single process.
+        """
+        def mapped_samples():
+            while True:
+                for index in self.sampler:
+                    yield self.mapper(self.dataset[index])
+        self._benchmark(mapped_samples(), num_iter, warmup, "Single Process Mapper (sec/sample)")
+    def benchmark_workers(self, num_iter, warmup=10):
+        """
+        Benchmark the dataloader by tuning num_workers to [0, 1, self.num_workers].
+        """
+        candidates = [0, 1] + ([] if self.num_workers in (0, 1) else [self.num_workers])
+        mapped = MapDataset(self.dataset, self.mapper)
+        for n in candidates:
+            loader = build_batch_data_loader(
+                mapped,
+                self.sampler,
+                self.total_batch_size,
+                num_workers=n,
+            )
+            # Iteration counts are scaled by the number of workers (at least 1).
+            scale = max(n, 1)
+            self._benchmark(
+                iter(loader),
+                num_iter * scale,
+                warmup * scale,
+                f"DataLoader ({n} workers, bs={self.per_gpu_batch_size})",
+            )
+            del loader
+    def benchmark_IPC(self, num_iter, warmup=10):
+        """
+        Benchmark the dataloader where each worker outputs nothing. This
+        eliminates the IPC overhead compared to the regular dataloader.
+        PyTorch multiprocessing's IPC only optimizes for torch tensors.
+        Large numpy arrays or other data structure may incur large IPC overhead.
+        """
+        n = self.num_workers
+        scale = max(n, 1)
+        empty_dataset = _EmptyMapDataset(MapDataset(self.dataset, self.mapper))
+        loader = build_batch_data_loader(
+            empty_dataset, self.sampler, self.total_batch_size, num_workers=n
+        )
+        self._benchmark(
+            iter(loader),
+            num_iter * scale,
+            warmup * scale,
+            f"DataLoader ({n} workers, bs={self.per_gpu_batch_size}) w/o comm",
+        )
+    def benchmark_distributed(self, num_iter, warmup=10):
+        """
+        Benchmark the dataloader in each distributed worker, and log results of
+        all workers. This helps understand the final performance as well as
+        the variances among workers.
+        It also prints startup time (first iter) of the dataloader.
+        """
+        gpu = comm.get_world_size()
+        mapped = MapDataset(self.dataset, self.mapper)
+        n = self.num_workers
+        loader = build_batch_data_loader(
+            mapped, self.sampler, self.total_batch_size, num_workers=n
+        )
+        # Startup time = creating the iterator plus fetching the very first batch.
+        timer = Timer()
+        loader = iter(loader)
+        next(loader)
+        startup_time = timer.seconds()
+        logger.info("Dataloader startup time: {:.2f} seconds".format(startup_time))
+        comm.synchronize()
+        scale = max(n, 1)
+        avg, all_times = self._benchmark(loader, num_iter * scale, warmup * scale)
+        del loader
+        self._log_time(
+            f"DataLoader ({gpu} GPUs x {n} workers, total bs={self.total_batch_size})",
+            avg,
+            all_times,
+            True,
+        )
--- a/detectron2/data/build.py
+++ b/detectron2/data/build.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import logging
+import numpy as np
+import operator
+import pickle
+import torch
+import torch.utils.data as torchdata
+from tabulate import tabulate
+from termcolor import colored
+from detectron2.config import configurable
+from detectron2.structures import BoxMode
+from detectron2.utils.comm import get_world_size
+from detectron2.utils.env import seed_all_rng
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import _log_api_usage, log_first_n
+from .catalog import DatasetCatalog, MetadataCatalog
+from .common import AspectRatioGroupedDataset, DatasetFromList, MapDataset, ToIterableDataset
+from .dataset_mapper import DatasetMapper
+from .detection_utils import check_metadata_consistency
+from .samplers import (
+    InferenceSampler,
+    RandomSubsetTrainingSampler,
+    RepeatFactorTrainingSampler,
+    TrainingSampler,
+)
+"""
+This file contains the default logic to build a dataloader for training or testing.
+"""
+__all__ = [
+    "build_batch_data_loader",
+    "build_detection_train_loader",
+    "build_detection_test_loader",
+    "get_detection_dataset_dicts",
+    "load_proposals_into_dataset",
+    "print_instances_class_histogram",
+]
+def filter_images_with_only_crowd_annotations(dataset_dicts):
+    """
+    Keep only the images that have at least one non-crowd annotation, i.e.
+    drop images with no annotations or with crowd annotations only.
+    A common training-time preprocessing on COCO dataset.
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+    Returns:
+        list[dict]: the same format, but filtered.
+    """
+    def has_non_crowd(anns):
+        # A missing "iscrowd" key counts as non-crowd.
+        return any(ann.get("iscrowd", 0) == 0 for ann in anns)
+    kept = [record for record in dataset_dicts if has_non_crowd(record["annotations"])]
+    num_removed = len(dataset_dicts) - len(kept)
+    logging.getLogger(__name__).info(
+        "Removed {} images with no usable annotations. {} images left.".format(
+            num_removed, len(kept)
+        )
+    )
+    return kept
+def filter_images_with_few_keypoints(dataset_dicts, min_keypoints_per_image):
+    """
+    Drop the images whose annotations contain too few visible keypoints.
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+        min_keypoints_per_image (int): an image is kept when its number of
+            keypoints with visibility > 0 is at least this value.
+    Returns:
+        list[dict]: the same format as dataset_dicts, but filtered.
+    """
+    def count_visible(record):
+        # Each keypoints field has the format [x1, y1, v1, ...], where v is visibility
+        total = 0
+        for ann in record["annotations"]:
+            if "keypoints" not in ann:
+                continue
+            visibility = np.array(ann["keypoints"][2::3])
+            total += (visibility > 0).sum()
+        return total
+    kept = [record for record in dataset_dicts if count_visible(record) >= min_keypoints_per_image]
+    num_removed = len(dataset_dicts) - len(kept)
+    logging.getLogger(__name__).info(
+        "Removed {} images with fewer than {} keypoints.".format(
+            num_removed, min_keypoints_per_image
+        )
+    )
+    return kept
+def load_proposals_into_dataset(dataset_dicts, proposal_file):
+    """
+    Attach precomputed object proposals to every record of the dataset.
+    The proposal file should be a pickled dict with the following keys:
+    - "ids": list[int] or list[str], the image ids
+    - "boxes": list[np.ndarray], each is an Nx4 array of boxes corresponding to the image id
+    - "objectness_logits": list[np.ndarray], each is an N sized array of objectness scores
+      corresponding to the boxes.
+    - "bbox_mode": the BoxMode of the boxes array. Defaults to ``BoxMode.XYXY_ABS``.
+    The legacy key names "indexes" and "scores" are accepted and renamed to
+    "ids" and "objectness_logits" respectively.
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+        proposal_file (str): file path of pre-computed proposals, in pkl format.
+    Returns:
+        list[dict]: the input list; each record is modified in place to gain the
+        "proposal_boxes", "proposal_objectness_logits" and "proposal_bbox_mode" fields.
+    Raises:
+        KeyError: if a record's image id has no entry in the proposal file.
+    """
+    logger = logging.getLogger(__name__)
+    logger.info("Loading proposals from: {}".format(proposal_file))
+    with PathManager.open(proposal_file, "rb") as f:
+        proposals = pickle.load(f, encoding="latin1")
+    # Rename the key names in D1 proposal files
+    for old_key, new_key in (("indexes", "ids"), ("scores", "objectness_logits")):
+        if old_key in proposals:
+            proposals[new_key] = proposals.pop(old_key)
+    # Image ids may be int or str, so everything is compared as str.
+    wanted_ids = {str(record["image_id"]) for record in dataset_dicts}
+    # Position inside the proposal lists for every image that the dataset contains.
+    id_to_index = {}
+    for position, proposal_id in enumerate(proposals["ids"]):
+        key = str(proposal_id)
+        if key in wanted_ids:
+            id_to_index[key] = position
+    # Precomputed proposals are assumed to be 'XYXY_ABS' unless the file says otherwise.
+    if "bbox_mode" in proposals:
+        bbox_mode = BoxMode(proposals["bbox_mode"])
+    else:
+        bbox_mode = BoxMode.XYXY_ABS
+    for record in dataset_dicts:
+        position = id_to_index[str(record["image_id"])]
+        boxes = proposals["boxes"][position]
+        logits = proposals["objectness_logits"][position]
+        # Highest score first.
+        order = logits.argsort()[::-1]
+        record["proposal_boxes"] = boxes[order]
+        record["proposal_objectness_logits"] = logits[order]
+        record["proposal_bbox_mode"] = bbox_mode
+    return dataset_dicts
+def print_instances_class_histogram(dataset_dicts, class_names):
+    """
+    Log a table with the number of non-crowd instances of every class.
+    Args:
+        dataset_dicts (list[dict]): list of dataset dicts.
+        class_names (list[str]): list of class names (zero-indexed).
+    Raises:
+        AssertionError: if a non-crowd annotation has a ``category_id`` outside
+            ``[0, len(class_names))``.
+    """
+    num_classes = len(class_names)
+    hist_bins = np.arange(num_classes + 1)
+    # Use the builtin ``int`` as dtype: the ``np.int`` alias was deprecated in
+    # NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
+    histogram = np.zeros((num_classes,), dtype=int)
+    for entry in dataset_dicts:
+        annos = entry["annotations"]
+        classes = np.asarray(
+            [x["category_id"] for x in annos if not x.get("iscrowd", 0)], dtype=int
+        )
+        if len(classes):
+            assert classes.min() >= 0, f"Got an invalid category_id={classes.min()}"
+            assert (
+                classes.max() < num_classes
+            ), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes"
+        histogram += np.histogram(classes, bins=hist_bins)[0]
+    # At most three (category, #instances) column pairs per table row.
+    N_COLS = min(6, len(class_names) * 2)
+    def short_name(x):
+        # make long class names shorter. useful for lvis
+        if len(x) > 13:
+            return x[:11] + ".."
+        return x
+    # Flat list of alternating cells: [name0, count0, name1, count1, ...]
+    data = list(
+        itertools.chain(*[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)])
+    )
+    total_num_instances = sum(data[1::2])
+    # Pad with empty cells so that the "total" entry starts on a fresh row.
+    data.extend([None] * (N_COLS - (len(data) % N_COLS)))
+    if num_classes > 1:
+        data.extend(["total", total_num_instances])
+    # Regroup the flat cell list into rows of N_COLS cells each.
+    data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
+    table = tabulate(
+        data,
+        headers=["category", "#instances"] * (N_COLS // 2),
+        tablefmt="pipe",
+        numalign="left",
+        stralign="center",
+    )
+    log_first_n(
+        logging.INFO,
+        "Distribution of instances among all {} categories:\n".format(num_classes)
+        + colored(table, "cyan"),
+        key="message",
+    )
+def get_detection_dataset_dicts(
+    names,
+    filter_empty=True,
+    min_keypoints=0,
+    proposal_files=None,
+    check_consistency=True,
+):
+    """
+    Load the named datasets from ``DatasetCatalog``, merge them and apply the
+    optional filtering steps used for instance detection/segmentation and
+    semantic segmentation.
+    Args:
+        names (str or list[str]): a dataset name or a list of dataset names
+        filter_empty (bool): whether to filter out images without instance annotations
+        min_keypoints (int): filter out images with fewer keypoints than
+            `min_keypoints`. Set to 0 to do nothing.
+        proposal_files (list[str]): if given, a list of object proposal files
+            that match each dataset in `names`.
+        check_consistency (bool): whether to check if datasets have consistent metadata.
+    Returns:
+        list[dict]: a list of dicts following the standard dataset dict format.
+        If the first loaded dataset is a ``torch.utils.data.Dataset``, a
+        ``ConcatDataset`` of the loaded datasets is returned instead and no
+        filtering or consistency check is applied.
+    """
+    if isinstance(names, str):
+        names = [names]
+    assert len(names), names
+    # Load everything first, then validate that none of the datasets is empty.
+    per_dataset = list(map(DatasetCatalog.get, names))
+    for dataset_name, loaded in zip(names, per_dataset):
+        assert len(loaded), "Dataset '{}' is empty!".format(dataset_name)
+    if proposal_files is not None:
+        assert len(names) == len(proposal_files)
+        # Attach the precomputed proposals of each file to its matching dataset.
+        per_dataset = [
+            load_proposals_into_dataset(loaded, proposal_file)
+            for loaded, proposal_file in zip(per_dataset, proposal_files)
+        ]
+    if isinstance(per_dataset[0], torchdata.Dataset):
+        return torchdata.ConcatDataset(per_dataset)
+    dataset_dicts = list(itertools.chain.from_iterable(per_dataset))
+    # The first record decides whether the merged data carries instance annotations.
+    has_instances = "annotations" in dataset_dicts[0]
+    if filter_empty and has_instances:
+        dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
+    if min_keypoints > 0 and has_instances:
+        dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)
+    if check_consistency and has_instances:
+        try:
+            thing_classes = MetadataCatalog.get(names[0]).thing_classes
+            check_metadata_consistency("thing_classes", names)
+            print_instances_class_histogram(dataset_dicts, thing_classes)
+        except AttributeError:
+            # class names are not available for this dataset
+            pass
+    assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names))
+    return dataset_dicts
+def build_batch_data_loader(
+    dataset,
+    sampler,
+    total_batch_size,
+    *,
+    aspect_ratio_grouping=False,
+    num_workers=0,
+    collate_fn=None,
+):
+    """
+    Build a batched dataloader. Compared to `torch.utils.data.DataLoader` it
+    1. supports aspect ratio grouping
+    2. uses no "batch collation" by default, because this is common for detection training
+    Args:
+        dataset (torch.utils.data.Dataset): a pytorch map-style or iterable dataset.
+        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces indices.
+            Must be provided iff. ``dataset`` is a map-style dataset.
+        total_batch_size, aspect_ratio_grouping, num_workers, collate_fn: see
+            :func:`build_detection_train_loader`.
+    Returns:
+        iterable[list]. Length of each list is the batch size of the current
+            GPU (``total_batch_size`` divided by ``get_world_size()``).
+            Each element in the list comes from the dataset.
+    """
+    num_processes = get_world_size()
+    assert (
+        total_batch_size > 0 and total_batch_size % num_processes == 0
+    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
+        total_batch_size, num_processes
+    )
+    per_process_batch_size = total_batch_size // num_processes
+    if not isinstance(dataset, torchdata.IterableDataset):
+        dataset = ToIterableDataset(dataset, sampler)
+    else:
+        assert sampler is None, "sampler must be None if dataset is IterableDataset"
+    if not aspect_ratio_grouping:
+        batch_collate = trivial_batch_collator if collate_fn is None else collate_fn
+        return torchdata.DataLoader(
+            dataset,
+            batch_size=per_process_batch_size,
+            drop_last=True,
+            num_workers=num_workers,
+            collate_fn=batch_collate,
+            worker_init_fn=worker_init_reset_seed,
+        )
+    # This loader does not batch: itemgetter(0) makes it yield individual mapped
+    # dicts, which are then batched by aspect ratio below.
+    element_loader = torchdata.DataLoader(
+        dataset,
+        num_workers=num_workers,
+        collate_fn=operator.itemgetter(0),
+        worker_init_fn=worker_init_reset_seed,
+    )
+    grouped = AspectRatioGroupedDataset(element_loader, per_process_batch_size)
+    return grouped if collate_fn is None else MapDataset(grouped, collate_fn)
+def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
+    """
+    Translate a config into the keyword arguments of :func:`build_detection_train_loader`.
+    Args:
+        cfg: the config to read dataset, dataloader, model and solver options from.
+        mapper: used as given; when None, ``DatasetMapper(cfg, True)`` is created.
+        dataset: used as given; when None, it is loaded from ``cfg.DATASETS.TRAIN``.
+        sampler: used as given; when None, it is built from ``cfg.DATALOADER.SAMPLER_TRAIN``.
+    Returns:
+        dict: the keys "dataset", "sampler", "mapper", "total_batch_size",
+        "aspect_ratio_grouping" and "num_workers".
+    Raises:
+        ValueError: if ``cfg.DATALOADER.SAMPLER_TRAIN`` names an unknown sampler.
+    """
+    if dataset is None:
+        if cfg.MODEL.KEYPOINT_ON:
+            min_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
+        else:
+            min_keypoints = 0
+        if cfg.MODEL.LOAD_PROPOSALS:
+            proposal_files = cfg.DATASETS.PROPOSAL_FILES_TRAIN
+        else:
+            proposal_files = None
+        dataset = get_detection_dataset_dicts(
+            cfg.DATASETS.TRAIN,
+            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
+            min_keypoints=min_keypoints,
+            proposal_files=proposal_files,
+        )
+        _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])
+    if mapper is None:
+        mapper = DatasetMapper(cfg, True)
+    if sampler is None:
+        sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
+        logging.getLogger(__name__).info("Using training sampler {}".format(sampler_name))
+        if sampler_name == "TrainingSampler":
+            sampler = TrainingSampler(len(dataset))
+        elif sampler_name == "RepeatFactorTrainingSampler":
+            # Per-image repeat factors are derived from category frequencies.
+            factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
+                dataset, cfg.DATALOADER.REPEAT_THRESHOLD
+            )
+            sampler = RepeatFactorTrainingSampler(factors)
+        elif sampler_name == "RandomSubsetTrainingSampler":
+            sampler = RandomSubsetTrainingSampler(len(dataset), cfg.DATALOADER.RANDOM_SUBSET_RATIO)
+        else:
+            raise ValueError("Unknown training sampler: {}".format(sampler_name))
+    loader_kwargs = {"dataset": dataset, "sampler": sampler, "mapper": mapper}
+    loader_kwargs["total_batch_size"] = cfg.SOLVER.IMS_PER_BATCH
+    loader_kwargs["aspect_ratio_grouping"] = cfg.DATALOADER.ASPECT_RATIO_GROUPING
+    loader_kwargs["num_workers"] = cfg.DATALOADER.NUM_WORKERS
+    return loader_kwargs
+@configurable(from_config=_train_loader_from_config)
+def build_detection_train_loader(
+    dataset,
+    *,
+    mapper,
+    sampler=None,
+    total_batch_size,
+    aspect_ratio_grouping=True,
+    num_workers=0,
+    collate_fn=None,
+):
+    """
+    Build a dataloader for object detection with some default features.
+    This interface is experimental.
+    Args:
+        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
+            or a pytorch dataset (either map-style or iterable). It can be obtained
+            by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
+        mapper (callable): a callable which takes a sample (dict) from dataset and
+            returns the format to be consumed by the model.
+            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
+            Pass None to use the dataset elements without mapping.
+        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces
+            indices to be applied on ``dataset``.
+            If ``dataset`` is map-style, the default sampler is a :class:`TrainingSampler`,
+            which coordinates an infinite random shuffle sequence across all workers.
+            Sampler must be None if ``dataset`` is iterable.
+        total_batch_size (int): total batch size across all workers. Batching
+            simply puts data into a list.
+        aspect_ratio_grouping (bool): whether to group images with similar
+            aspect ratio for efficiency. When enabled, it requires each
+            element in dataset be a dict with keys "width" and "height".
+        num_workers (int): number of parallel data loading workers
+        collate_fn: same as the argument of `torch.utils.data.DataLoader`.
+            Defaults to do no collation and return a list of data.
+            No collation is OK for small batch size and simple data structures.
+            If your batch size is large and each sample contains too many small tensors,
+            it's more efficient to collate them in data loader.
+    Returns:
+        torch.utils.data.DataLoader:
+            a dataloader. Each output from it is a ``list[mapped_element]`` holding
+            this process's share of ``total_batch_size``, where ``mapped_element``
+            is produced by the ``mapper``.
+    """
+    # Normalize the input: plain lists become a Dataset, then the mapper is applied lazily.
+    if isinstance(dataset, list):
+        dataset = DatasetFromList(dataset, copy=False)
+    if mapper is not None:
+        dataset = MapDataset(dataset, mapper)
+    is_iterable = isinstance(dataset, torchdata.IterableDataset)
+    if is_iterable:
+        assert sampler is None, "sampler must be None if dataset is IterableDataset"
+    else:
+        if sampler is None:
+            sampler = TrainingSampler(len(dataset))
+        assert isinstance(sampler, torchdata.Sampler), f"Expect a Sampler but got {type(sampler)}"
+    loader_options = {
+        "aspect_ratio_grouping": aspect_ratio_grouping,
+        "num_workers": num_workers,
+        "collate_fn": collate_fn,
+    }
+    return build_batch_data_loader(dataset, sampler, total_batch_size, **loader_options)
+def _test_loader_from_config(cfg, dataset_name, mapper=None):
+    """
+    Translate a config into the keyword arguments of :func:`build_detection_test_loader`.
+    Uses the given `dataset_name` argument (instead of the names in cfg), because the
+    standard practice is to evaluate each test set individually (not combining them).
+    When ``cfg.MODEL.LOAD_PROPOSALS`` is set, the proposal file of each dataset is
+    looked up by the position of its name inside ``cfg.DATASETS.TEST``.
+    """
+    names = [dataset_name] if isinstance(dataset_name, str) else dataset_name
+    proposal_files = None
+    if cfg.MODEL.LOAD_PROPOSALS:
+        test_names = list(cfg.DATASETS.TEST)
+        proposal_files = [
+            cfg.DATASETS.PROPOSAL_FILES_TEST[test_names.index(name)] for name in names
+        ]
+    dataset = get_detection_dataset_dicts(
+        names,
+        filter_empty=False,
+        proposal_files=proposal_files,
+    )
+    if mapper is None:
+        mapper = DatasetMapper(cfg, False)
+    loader_kwargs = {"dataset": dataset, "mapper": mapper}
+    loader_kwargs["num_workers"] = cfg.DATALOADER.NUM_WORKERS
+    return loader_kwargs
+@configurable(from_config=_test_loader_from_config)
+def build_detection_test_loader(dataset, *, mapper, sampler=None, num_workers=0, collate_fn=None):
+    """
+    Similar to `build_detection_train_loader`, but uses a batch size of 1,
+    and :class:`InferenceSampler`. This sampler coordinates all workers to
+    produce the exact set of all samples.
+    This interface is experimental.
+    Args:
+        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
+            or a pytorch dataset (either map-style or iterable). They can be obtained
+            by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
+        mapper (callable): a callable which takes a sample (dict) from dataset
+           and returns the format to be consumed by the model.
+           When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
+           Pass None to use the dataset elements without mapping.
+        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces
+            indices to be applied on ``dataset``. Default to :class:`InferenceSampler`,
+            which splits the dataset across all workers. Sampler must be None
+            if `dataset` is iterable.
+        num_workers (int): number of parallel data loading workers
+        collate_fn: same as the argument of `torch.utils.data.DataLoader`.
+            Defaults to do no collation and return a list of data.
+    Returns:
+        DataLoader: a torch DataLoader, that loads the given detection
+        dataset, with test-time transformation and batching.
+    Examples:
+    ::
+        data_loader = build_detection_test_loader(
+            DatasetCatalog.get("my_test"),
+            mapper=DatasetMapper(...))
+        # or, instantiate with a CfgNode:
+        data_loader = build_detection_test_loader(cfg, "my_test")
+    """
+    # Normalize the input: plain lists become a Dataset, then the mapper is applied lazily.
+    if isinstance(dataset, list):
+        dataset = DatasetFromList(dataset, copy=False)
+    if mapper is not None:
+        dataset = MapDataset(dataset, mapper)
+    is_iterable = isinstance(dataset, torchdata.IterableDataset)
+    if is_iterable:
+        assert sampler is None, "sampler must be None if dataset is IterableDataset"
+    elif sampler is None:
+        sampler = InferenceSampler(len(dataset))
+    batch_collate = trivial_batch_collator if collate_fn is None else collate_fn
+    # Always use 1 image per worker during inference since this is the
+    # standard when reporting inference time in papers.
+    return torchdata.DataLoader(
+        dataset,
+        batch_size=1,
+        sampler=sampler,
+        num_workers=num_workers,
+        collate_fn=batch_collate,
+    )
+def trivial_batch_collator(batch):
+    """
+    A batch collator that does nothing.
+    Args:
+        batch: the batch handed to the collator.
+    Returns:
+        The very same ``batch`` object, unchanged.
+    """
+    return batch
+def worker_init_reset_seed(worker_id):
+    """
+    Seed the random number generators through ``seed_all_rng``.
+    The seed is torch's initial seed reduced modulo 2**31, offset by ``worker_id``.
+    Args:
+        worker_id (int): id of the worker, added to the seed so that workers differ.
+    """
+    base_seed = torch.initial_seed() % (2 ** 31)
+    seed_all_rng(base_seed + worker_id)
--- a/detectron2/data/catalog.py
+++ b/detectron2/data/catalog.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import types
+from collections import UserDict
+from typing import List
+from detectron2.utils.logger import log_first_n
+__all__ = ["DatasetCatalog", "MetadataCatalog", "Metadata"]
+class _DatasetCatalog(UserDict):
+    """
+    A global dictionary that stores information about the datasets and how to obtain them.
+    It contains a mapping from strings
+    (which are names that identify a dataset, e.g. "coco_2014_train")
+    to a function which parses the dataset and returns the samples in the
+    format of `list[dict]`.
+    The returned dicts should be in Detectron2 Dataset format (See DATASETS.md for details)
+    if used with the data loader functionalities in `data/build.py,data/detection_transform.py`.
+    The purpose of having this catalog is to make it easy to choose
+    different datasets, by just using the strings in the config.
+    """
+    def register(self, name, func):
+        """
+        Store ``func`` as the loader of the dataset called ``name``.
+        Args:
+            name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+            func (callable): a callable which takes no arguments and returns a list of dicts.
+                It must return the same results if called multiple times.
+        Raises:
+            AssertionError: if ``func`` is not callable or ``name`` is already registered.
+        """
+        assert callable(func), "You must register a function with `DatasetCatalog.register`!"
+        assert name not in self, "Dataset '{}' is already registered!".format(name)
+        self[name] = func
+    def get(self, name):
+        """
+        Look up the loader registered under ``name``, call it and return what it produces.
+        Args:
+            name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+        Returns:
+            list[dict]: dataset annotations.
+        Raises:
+            KeyError: if ``name`` is not registered; the message lists the known names.
+        """
+        try:
+            loader = self[name]
+        except KeyError as err:
+            available = ", ".join(self.keys())
+            message = "Dataset '{}' is not registered! Available datasets are: {}".format(
+                name, available
+            )
+            raise KeyError(message) from err
+        return loader()
+    def list(self) -> List[str]:
+        """
+        Return the names of all registered datasets.
+        Returns:
+            list[str]
+        """
+        return [*self.keys()]
+    def remove(self, name):
+        """
+        Alias of ``pop``; the removed entry is not returned.
+        """
+        self.pop(name)
+    def __str__(self):
+        registered = ", ".join(self.keys())
+        return "DatasetCatalog(registered datasets: {})".format(registered)
+    __repr__ = __str__
+# The single module-level catalog instance.
+DatasetCatalog = _DatasetCatalog()
+# Give the instance its own docstring: the class docstring followed by
+# ``automethod`` directives for ``register`` and ``get``
+# (presumably consumed by the documentation build).
+DatasetCatalog.__doc__ = (
+    _DatasetCatalog.__doc__
+    + """
+    .. automethod:: detectron2.data.catalog.DatasetCatalog.register
+    .. automethod:: detectron2.data.catalog.DatasetCatalog.get
+"""
+)
+class Metadata(types.SimpleNamespace):
+    """
+    A class that supports simple attribute setter/getter.
+    It is intended for storing metadata of a dataset and make it accessible globally.
+    Once an attribute has been set, setting it again to a different value fails
+    with an AssertionError. Attribute names listed in ``_RENAMED`` are redirected
+    to their new names (with a warning) on both read and write.
+    Examples:
+    ::
+        # somewhere when you load the data:
+        MetadataCatalog.get("mydataset").thing_classes = ["person", "dog"]
+        # somewhere when you print statistics or visualize:
+        classes = MetadataCatalog.get("mydataset").thing_classes
+    """
+    # the name of the dataset
+    # set default to N/A so that `self.name` in the errors will not trigger getattr again
+    name: str = "N/A"
+    # Maps a deprecated attribute name to the name that replaced it.
+    _RENAMED = {
+        "class_names": "thing_classes",
+        "dataset_id_to_contiguous_id": "thing_dataset_id_to_contiguous_id",
+        "stuff_class_names": "stuff_classes",
+    }
+    def __getattr__(self, key):
+        """
+        Fallback used by Python only when normal attribute lookup fails.
+        A renamed key is forwarded to its new name after logging a warning;
+        any other key raises AttributeError with a message naming the dataset.
+        """
+        if key in self._RENAMED:
+            log_first_n(
+                logging.WARNING,
+                "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
+                n=10,
+            )
+            return getattr(self, self._RENAMED[key])
+        # "name" exists in every metadata
+        # so more than one instance attribute means some real metadata has been set.
+        if len(self.__dict__) > 1:
+            raise AttributeError(
+                "Attribute '{}' does not exist in the metadata of dataset '{}'. Available "
+                "keys are {}.".format(key, self.name, str(self.__dict__.keys()))
+            )
+        else:
+            raise AttributeError(
+                f"Attribute '{key}' does not exist in the metadata of dataset '{self.name}': "
+                "metadata is empty."
+            )
+    def __setattr__(self, key, val):
+        """
+        Set ``key`` to ``val`` unless it already holds a value.
+        Raises:
+            AssertionError: if ``key`` already has a value that is not equal to ``val``.
+        """
+        if key in self._RENAMED:
+            log_first_n(
+                logging.WARNING,
+                "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
+                n=10,
+            )
+            # Store the value under the new name. Execution then continues below,
+            # where reading the old key is forwarded to the new name by __getattr__.
+            setattr(self, self._RENAMED[key], val)
+        # Ensure that metadata of the same name stays consistent
+        try:
+            oldval = getattr(self, key)
+            assert oldval == val, (
+                "Attribute '{}' in the metadata of '{}' cannot be set "
+                "to a different value!\n{} != {}".format(key, self.name, oldval, val)
+            )
+        except AttributeError:
+            # The attribute does not exist yet, so this is the first assignment.
+            super().__setattr__(key, val)
+    def as_dict(self):
+        """
+        Returns all the metadata as a dict.
+        Note that modifications to the returned dict will not reflect on the Metadata object.
+        The copy is shallow: the values themselves are shared with the Metadata object.
+        """
+        return copy.copy(self.__dict__)
+    def set(self, **kwargs):
+        """
+        Set multiple metadata with kwargs.
+        Each pair goes through ``setattr``, so the consistency check of
+        ``__setattr__`` applies to every key.
+        Returns:
+            Metadata: this object, so that calls can be chained.
+        """
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+        return self
+    def get(self, key, default=None):
+        """
+        Access an attribute and return its value if exists.
+        Otherwise return default.
+        """
+        try:
+            return getattr(self, key)
+        except AttributeError:
+            return default
+class _MetadataCatalog(UserDict):
+    """
+    MetadataCatalog is a global dictionary that provides access to
+    :class:`Metadata` of a given dataset.
+    The metadata associated with a certain name is a singleton: once created, the
+    metadata will stay alive and will be returned by future calls to ``get(name)``.
+    It's like global variables, so don't abuse it.
+    It's meant for storing knowledge that's constant and shared across the execution
+    of the program, e.g.: the class names in COCO.
+    """
+    def get(self, name):
+        """
+        Return the metadata stored under ``name``, creating it on first access.
+        Args:
+            name (str): name of a dataset (e.g. coco_2014_train).
+        Returns:
+            Metadata: The :class:`Metadata` instance associated with this name,
+            or create an empty one if none is available.
+        Raises:
+            AssertionError: if ``name`` is empty.
+        """
+        assert len(name)
+        existing = super().get(name, None)
+        if existing is not None:
+            return existing
+        # First access: create the metadata and remember it for later calls.
+        created = Metadata(name=name)
+        self[name] = created
+        return created
+    def list(self):
+        """
+        Return the names under which metadata is registered.
+        Returns:
+            list[str]: keys (names of datasets) of all registered metadata
+        """
+        return [*self.keys()]
+    def remove(self, name):
+        """
+        Alias of ``pop``; the removed entry is not returned.
+        """
+        self.pop(name)
+    def __str__(self):
+        registered = ", ".join(self.keys())
+        return "MetadataCatalog(registered metadata: {})".format(registered)
+    __repr__ = __str__
+# The single module-level catalog instance.
+MetadataCatalog = _MetadataCatalog()
+# Give the instance its own docstring: the class docstring followed by an
+# ``automethod`` directive for ``get`` (presumably consumed by the documentation build).
+MetadataCatalog.__doc__ = (
+    _MetadataCatalog.__doc__
+    + """
+    .. automethod:: detectron2.data.catalog.MetadataCatalog.get
+"""
+)
--- a/detectron2/data/common.py
+++ b/detectron2/data/common.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import itertools
+import logging
+import numpy as np
+import pickle
+import random
+import torch.utils.data as data
+from torch.utils.data.sampler import Sampler
+from detectron2.utils.serialize import PicklableWrapper
+__all__ = ["MapDataset", "DatasetFromList", "AspectRatioGroupedDataset", "ToIterableDataset"]
+def _shard_iterator_dataloader_worker(iterable):
+    """
+    Yield the elements of ``iterable`` that belong to the current dataloader worker.
+    Outside of a pytorch dataloader worker, or with a single worker, everything is
+    yielded. Otherwise worker ``i`` of ``n`` yields elements ``i, i + n, i + 2n, ...``.
+    """
+    info = data.get_worker_info()
+    needs_sharding = info is not None and info.num_workers != 1
+    if needs_sharding:
+        yield from itertools.islice(iterable, info.id, None, info.num_workers)
+    else:
+        # Nothing to shard: pass everything through.
+        yield from iterable
+class _MapIterableDataset(data.IterableDataset):
+    """
+    Apply a function to every element of an IterableDataset.
+    Similar to pytorch's MapIterDataPipe, but elements for which map_func
+    returns None are dropped.
+    This class is not public-facing. Will be called by `MapDataset`.
+    """
+    def __init__(self, dataset, map_func):
+        """
+        Args:
+            dataset: the iterable dataset to read elements from.
+            map_func: a callable applied to each element; return None to drop the element.
+        """
+        self._dataset = dataset
+        # wrap so that a lambda will work
+        self._map_func = PicklableWrapper(map_func)
+    def __len__(self):
+        # Length of the wrapped dataset; dropped elements are not accounted for.
+        return len(self._dataset)
+    def __iter__(self):
+        for element in self._dataset:
+            mapped = self._map_func(element)
+            if mapped is None:
+                continue
+            yield mapped
+class MapDataset(data.Dataset):
+    """
+    Map a function over the elements in a dataset.
+    """
+    def __init__(self, dataset, map_func):
+        """
+        Args:
+            dataset: a dataset where map function is applied. Can be either
+                map-style or iterable dataset. When given an iterable dataset,
+                the returned object will also be an iterable dataset.
+            map_func: a callable which maps the element in dataset. map_func can
+                return None to skip the data (e.g. in case of errors).
+                How None is handled depends on the style of `dataset`.
+                If `dataset` is map-style, it randomly tries other elements.
+                If `dataset` is iterable, it skips the data and tries the next.
+        """
+        self._dataset = dataset
+        self._map_func = PicklableWrapper(map_func)  # wrap so that a lambda will work
+        # Fixed seed so that the choice of fallback indices is reproducible.
+        self._rng = random.Random(42)
+        # Indices that are still eligible as a replacement when mapping fails.
+        self._fallback_candidates = set(range(len(dataset)))
+    def __new__(cls, dataset, map_func):
+        # An iterable dataset cannot be indexed, so hand out the iterable variant instead.
+        is_iterable = isinstance(dataset, data.IterableDataset)
+        if is_iterable:
+            return _MapIterableDataset(dataset, map_func)
+        else:
+            return super().__new__(cls)
+    def __getnewargs__(self):
+        # Arguments passed to __new__ when the object is unpickled.
+        return self._dataset, self._map_func
+    def __len__(self):
+        return len(self._dataset)
+    def __getitem__(self, idx):
+        """
+        Return the mapped element at ``idx``. When ``map_func`` returns None, the
+        index is removed from the fallback pool and a random index from the pool
+        is tried instead, until one succeeds.
+        Raises:
+            ValueError: (from ``random.sample``) if the fallback pool becomes empty.
+        """
+        retry_count = 0
+        cur_idx = int(idx)
+        while True:
+            mapped = self._map_func(self._dataset[cur_idx])
+            if mapped is not None:
+                self._fallback_candidates.add(cur_idx)
+                return mapped
+            # _map_func fails for this idx, use a random new index from the pool
+            retry_count += 1
+            self._fallback_candidates.discard(cur_idx)
+            # `random.sample` no longer accepts a set since Python 3.11 (TypeError).
+            # Older versions converted the set with `tuple()` internally, so doing
+            # the same conversion here keeps the selected indices unchanged.
+            cur_idx = self._rng.sample(tuple(self._fallback_candidates), k=1)[0]
+            if retry_count >= 3:
+                logger = logging.getLogger(__name__)
+                logger.warning(
+                    "Failed to apply `_map_func` for idx: {}, retry count: {}".format(
+                        idx, retry_count
+                    )
+                )
+class DatasetFromList(data.Dataset):
+    """
+    Expose a python list as a torch Dataset whose elements are the list items.
+    """
+    def __init__(self, lst: list, copy: bool = True, serialize: bool = True):
+        """
+        Args:
+            lst (list): a list which contains elements to produce.
+            copy (bool): whether to deepcopy the element when producing it,
+                so that the result can be modified in place without affecting the
+                source in the list. Only consulted when ``serialize`` is False.
+            serialize (bool): whether to hold memory using serialized objects, when
+                enabled, data loader workers can use shared RAM from master
+                process instead of making a copy.
+        """
+        self._lst = lst
+        self._copy = copy
+        self._serialize = serialize
+        if not self._serialize:
+            return
+        logger = logging.getLogger(__name__)
+        logger.info(
+            "Serializing {} elements to byte tensors and concatenating them all ...".format(
+                len(self._lst)
+            )
+        )
+        # Pickle every element into its own uint8 array ...
+        blobs = [
+            np.frombuffer(pickle.dumps(element, protocol=-1), dtype=np.uint8)
+            for element in self._lst
+        ]
+        # ... remember where each element ends inside the concatenated buffer ...
+        lengths = np.asarray([len(blob) for blob in blobs], dtype=np.int64)
+        self._addr = np.cumsum(lengths)
+        # ... and keep all elements as one flat byte array.
+        self._lst = np.concatenate(blobs)
+        logger.info("Serialized dataset takes {:.2f} MiB".format(len(self._lst) / 1024 ** 2))
+    def __len__(self):
+        # In serialized form `_lst` is a flat byte buffer, so count the end offsets instead.
+        return len(self._addr) if self._serialize else len(self._lst)
+    def __getitem__(self, idx):
+        if not self._serialize:
+            element = self._lst[idx]
+            return copy.deepcopy(element) if self._copy else element
+        # Element `idx` occupies the bytes between the previous end offset and its own.
+        start_addr = 0 if idx == 0 else self._addr[idx - 1].item()
+        end_addr = self._addr[idx].item()
+        payload = memoryview(self._lst[start_addr:end_addr])
+        return pickle.loads(payload)
+class ToIterableDataset(data.IterableDataset):
+    """
+    Turn an indices-based (also called map-style) dataset plus a sampler
+    into an iterable-style dataset.
+    """
+    def __init__(self, dataset: data.Dataset, sampler: Sampler, shard_sampler: bool = True):
+        """
+        Args:
+            dataset: an old-style dataset with ``__getitem__``
+            sampler: a cheap iterable that produces indices to be applied on ``dataset``.
+            shard_sampler: whether to shard the sampler based on the current pytorch data loader
+                worker id. When an IterableDataset is forked by pytorch's DataLoader into multiple
+                workers, it is responsible for sharding its data based on worker id so that workers
+                don't produce identical data.
+                Most samplers (like our TrainingSampler) do not shard based on dataloader worker id
+                and this argument should be set to True. But certain samplers may be already
+                sharded, in that case this argument should be set to False.
+        """
+        assert not isinstance(dataset, data.IterableDataset), dataset
+        assert isinstance(sampler, Sampler), sampler
+        self.dataset = dataset
+        self.sampler = sampler
+        self.shard_sampler = shard_sampler
+    def __iter__(self):
+        # With map-style dataset, `DataLoader(dataset, sampler)` runs the
+        # sampler in main process only. But `DataLoader(ToIterableDataset(dataset, sampler))`
+        # will run sampler in every of the N worker. So each worker should keep only
+        # 1/N of the ids. The assumption is that sampler is cheap to iterate so it's
+        # fine to discard ids in workers.
+        if self.shard_sampler:
+            indices = _shard_iterator_dataloader_worker(self.sampler)
+        else:
+            indices = self.sampler
+        for index in indices:
+            yield self.dataset[index]
+    def __len__(self):
+        # Length of the sampler, not of the wrapped dataset.
+        return len(self.sampler)
+class AspectRatioGroupedDataset(data.IterableDataset):
+    """
+    Batch data that have similar aspect ratio together.
+    In this implementation there are two groups: images with width > height,
+    and all other images (width <= height). Images from the same group are
+    batched together.
+    This improves training speed because the images then need less padding
+    to form a batch.
+    It assumes the underlying dataset produces dicts with "width" and "height" keys.
+    It will then produce a list of original dicts with length = batch_size,
+    all from the same aspect ratio group.
+    Elements left in a partially filled bucket when the underlying dataset is
+    exhausted are not emitted; they stay in the bucket for the next iteration.
+    """
+    def __init__(self, dataset, batch_size):
+        """
+        Args:
+            dataset: an iterable. Each element must be a dict with keys
+                "width" and "height", which will be used to batch data.
+            batch_size (int): number of elements in every produced batch.
+        """
+        self.dataset = dataset
+        self.batch_size = batch_size
+        # Hard-coded two aspect ratio groups: w > h (bucket 0) and w <= h (bucket 1).
+        # Can add support for more aspect ratio groups, but doesn't seem useful
+        self._buckets = [[] for _ in range(2)]
+    def __iter__(self):
+        """
+        Yield lists of ``batch_size`` dicts that all fall in the same aspect ratio group.
+        """
+        for d in self.dataset:
+            w, h = d["width"], d["height"]
+            bucket_id = 0 if w > h else 1
+            bucket = self._buckets[bucket_id]
+            bucket.append(d)
+            if len(bucket) == self.batch_size:
+                batch = bucket[:]
+                # Empty the bucket BEFORE yielding: code placed after a `yield` does
+                # not run if the consumer stops iterating or closes the generator.
+                # The buckets live on the instance, so a bucket left full would
+                # overshoot `batch_size` on the next iteration and never yield again.
+                del bucket[:]
+                yield batch