# Copyright (c) Facebook, Inc. and its affiliates.
import glob
import logging
import numpy as np
import os
import tempfile
from collections import OrderedDict
import torch
from PIL import Image
from detectron2.data import MetadataCatalog
from detectron2.utils import comm
from detectron2.utils.file_io import PathManager
from .evaluator import DatasetEvaluator
class CityscapesEvaluator(DatasetEvaluator):
"""
Base class for evaluation using cityscapes API.
"""
def __init__(self, dataset_name):
"""
Args:
dataset_name (str): the name of the dataset.
It must have the following metadata associated with it:
"thing_classes", "gt_dir".
"""
self._metadata = MetadataCatalog.get(dataset_name)
self._cpu_device = torch.device("cpu")
self._logger = logging.getLogger(__name__)
def reset(self):
self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_")
self._temp_dir = self._working_dir.name
# All workers will write to the same results directory
# TODO this does not work in distributed training
assert (
comm.get_local_size() == comm.get_world_size()
), "CityscapesEvaluator currently do not work with multiple machines."
self._temp_dir = comm.all_gather(self._temp_dir)[0]
if self._temp_dir != self._working_dir.name:
self._working_dir.cleanup()
self._logger.info(
"Writing cityscapes results to temporary directory {} ...".format(self._temp_dir)
)
class CityscapesInstanceEvaluator(CityscapesEvaluator):
"""
Evaluate instance segmentation results on cityscapes dataset using cityscapes API.
Note:
* It does not work in multi-machine distributed training.
* It contains a synchronization, therefore has to be used on all ranks.
* Only the main process runs evaluation.
"""
def process(self, inputs, outputs):
from cityscapesscripts.helpers.labels import name2label
for input, output in zip(inputs, outputs):
file_name = input["file_name"]
basename = os.path.splitext(os.path.basename(file_name))[0]
pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt")
if "instances" in output:
output = output["instances"].to(self._cpu_device)
num_instances = len(output)
with open(pred_txt, "w") as fout:
for i in range(num_instances):
pred_class = output.pred_classes[i]
classes = self._metadata.thing_classes[pred_class]
class_id = name2label[classes].id
score = output.scores[i]
mask = output.pred_masks[i].numpy().astype("uint8")
png_filename = os.path.join(
self._temp_dir, basename + "_{}_{}.png".format(i, classes)
)
Image.fromarray(mask * 255).save(png_filename)
fout.write(
"{} {} {}\n".format(os.path.basename(png_filename), class_id, score)
)
else:
# Cityscapes requires a prediction file for every ground truth image.
with open(pred_txt, "w") as fout:
pass
def evaluate(self):
"""
Returns:
dict: has a key "segm", whose value is a dict of "AP" and "AP50".
"""
comm.synchronize()
if comm.get_rank() > 0:
return
import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval
self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
# set some global states in cityscapes evaluation API, before evaluating
cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
cityscapes_eval.args.predictionWalk = None
cityscapes_eval.args.JSONOutput = False
cityscapes_eval.args.colorized = False
cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json")
# These lines are adapted from
# https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa
gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png"))
assert len(
groundTruthImgList
), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
cityscapes_eval.args.groundTruthSearch
)
predictionImgList = []
for gt in groundTruthImgList:
predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args))
results = cityscapes_eval.evaluateImgLists(
predictionImgList, groundTruthImgList, cityscapes_eval.args
)["averages"]
ret = OrderedDict()
ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100}
self._working_dir.cleanup()
return ret
class CityscapesSemSegEvaluator(CityscapesEvaluator):
"""
Evaluate semantic segmentation results on cityscapes dataset using cityscapes API.
Note:
* It does not work in multi-machine distributed training.
* It contains a synchronization, therefore has to be used on all ranks.
* Only the main process runs evaluation.
"""
def process(self, inputs, outputs):
from cityscapesscripts.helpers.labels import trainId2label
for input, output in zip(inputs, outputs):
file_name = input["file_name"]
basename = os.path.splitext(os.path.basename(file_name))[0]
pred_filename = os.path.join(self._temp_dir, basename + "_pred.png")
output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy()
pred = 255 * np.ones(output.shape, dtype=np.uint8)
for train_id, label in trainId2label.items():
if label.ignoreInEval:
continue
pred[output == train_id] = label.id
Image.fromarray(pred).save(pred_filename)
def evaluate(self):
comm.synchronize()
if comm.get_rank() > 0:
return
# Load the Cityscapes eval script *after* setting the required env var,
# since the script reads CITYSCAPES_DATASET into global variables at load time.
import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval
self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
# set some global states in cityscapes evaluation API, before evaluating
cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
cityscapes_eval.args.predictionWalk = None
cityscapes_eval.args.JSONOutput = False
cityscapes_eval.args.colorized = False
# These lines are adapted from
# https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa
gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png"))
assert len(
groundTruthImgList
), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
cityscapes_eval.args.groundTruthSearch
)
predictionImgList = []
for gt in groundTruthImgList:
predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt))
results = cityscapes_eval.evaluateImgLists(
predictionImgList, groundTruthImgList, cityscapes_eval.args
)
ret = OrderedDict()
ret["sem_seg"] = {
"IoU": 100.0 * results["averageScoreClasses"],
"iIoU": 100.0 * results["averageScoreInstClasses"],
"IoU_sup": 100.0 * results["averageScoreCategories"],
"iIoU_sup": 100.0 * results["averageScoreInstCategories"],
}
self._working_dir.cleanup()
return ret
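# --- Added example (not part of the original file) ---
# A minimal sketch of driving the Cityscapes evaluators, assuming detectron2's
# builtin dataset "cityscapes_fine_instance_seg_val" is registered and `model`
# follows the standard input/output conventions. Both evaluators synchronize
# across ranks, so this must run on every worker; only the main process
# returns metrics.
def _example_cityscapes_eval(cfg, model):  # hypothetical helper, illustration only
    from detectron2.data import build_detection_test_loader
    from detectron2.evaluation import CityscapesInstanceEvaluator, inference_on_dataset

    evaluator = CityscapesInstanceEvaluator("cityscapes_fine_instance_seg_val")
    loader = build_detection_test_loader(cfg, "cityscapes_fine_instance_seg_val")
    # On the main process this returns e.g. {"segm": {"AP": ..., "AP50": ...}}.
    return inference_on_dataset(model, loader, evaluator)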
# Copyright (c) Facebook, Inc. and its affiliates.
import contextlib
import copy
import io
import itertools
import json
import logging
import numpy as np
import os
import pickle
from collections import OrderedDict
import pycocotools.mask as mask_util
import torch
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from tabulate import tabulate
import detectron2.utils.comm as comm
from detectron2.config import CfgNode
from detectron2.data import MetadataCatalog
from detectron2.data.datasets.coco import convert_to_coco_json
from detectron2.structures import Boxes, BoxMode, pairwise_iou
from detectron2.utils.file_io import PathManager
from detectron2.utils.logger import create_small_table
from .evaluator import DatasetEvaluator
try:
from detectron2.evaluation.fast_eval_api import COCOeval_opt
except ImportError:
COCOeval_opt = COCOeval
class COCOEvaluator(DatasetEvaluator):
"""
Evaluate AR for object proposals, AP for instance detection/segmentation, and
AP for keypoint detection outputs, using COCO's metrics.
See http://cocodataset.org/#detection-eval and
http://cocodataset.org/#keypoints-eval to understand its metrics.
The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
the metric cannot be computed (e.g. due to no predictions made).
In addition to COCO, this evaluator is able to support any bounding box detection,
instance segmentation, or keypoint detection dataset.
"""
def __init__(
self,
dataset_name,
tasks=None,
distributed=True,
output_dir=None,
*,
max_dets_per_image=None,
use_fast_impl=True,
kpt_oks_sigmas=(),
allow_cached_coco=True,
):
"""
Args:
dataset_name (str): name of the dataset to be evaluated.
It must have either the following corresponding metadata:
"json_file": the path to the COCO format annotation
Or it must be in detectron2's standard dataset format
so it can be converted to COCO format automatically.
tasks (tuple[str]): tasks that can be evaluated under the given
configuration. A task is one of "bbox", "segm", "keypoints".
By default, will infer this automatically from predictions.
distributed (bool): if True, will collect results from all ranks and run evaluation
in the main process.
Otherwise, will only evaluate the results in the current process.
output_dir (str): optional, an output directory to dump all
results predicted on the dataset. The dump contains two files:
1. "instances_predictions.pth" a file that can be loaded with `torch.load` and
contains all the results in the format they are produced by the model.
2. "coco_instances_results.json" a json file in COCO's result format.
max_dets_per_image (int): limit on the maximum number of detections per image.
By default in COCO, this limit is 100, but it can be customized
to be greater, as is needed in evaluation metrics AP fixed and AP pool
(see https://arxiv.org/pdf/2102.01066.pdf)
This doesn't affect keypoint evaluation.
use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP.
Although the results should be very close to the official implementation in COCO
API, it is still recommended to compute results with the official API for use in
papers. The faster implementation also uses more RAM.
kpt_oks_sigmas (list[float]): The sigmas used to calculate keypoint OKS.
See http://cocodataset.org/#keypoints-eval
When empty, it will use the defaults in COCO.
Otherwise it should be the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
allow_cached_coco (bool): Whether to use cached coco json from previous validation
runs. You should set this to False if you need to use different validation data.
Defaults to True.
"""
self._logger = logging.getLogger(__name__)
self._distributed = distributed
self._output_dir = output_dir
if use_fast_impl and (COCOeval_opt is COCOeval):
self._logger.info("Fast COCO eval is not built. Falling back to official COCO eval.")
use_fast_impl = False
self._use_fast_impl = use_fast_impl
# COCOeval requires the limit on the number of detections per image (maxDets) to be a list
# with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the
# 3rd element (100) is used as the limit on the number of detections per image when
# evaluating AP. COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval,
# we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults.
if max_dets_per_image is None:
max_dets_per_image = [1, 10, 100]
else:
max_dets_per_image = [1, 10, max_dets_per_image]
self._max_dets_per_image = max_dets_per_image
if tasks is not None and isinstance(tasks, CfgNode):
kpt_oks_sigmas = (
tasks.TEST.KEYPOINT_OKS_SIGMAS if not kpt_oks_sigmas else kpt_oks_sigmas
)
self._logger.warning(
"COCOEvaluator instantiated using config; this is deprecated behavior."
" Please pass in explicit arguments instead."
)
self._tasks = None # Inferring it from predictions should be better
else:
self._tasks = tasks
self._cpu_device = torch.device("cpu")
self._metadata = MetadataCatalog.get(dataset_name)
if not hasattr(self._metadata, "json_file"):
if output_dir is None:
raise ValueError(
"output_dir must be provided to COCOEvaluator "
"for datasets not in COCO format."
)
self._logger.info(f"Trying to convert '{dataset_name}' to COCO format ...")
cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json")
self._metadata.json_file = cache_path
convert_to_coco_json(dataset_name, cache_path, allow_cached=allow_cached_coco)
json_file = PathManager.get_local_path(self._metadata.json_file)
with contextlib.redirect_stdout(io.StringIO()):
self._coco_api = COCO(json_file)
# Test set json files do not contain annotations (evaluation must be
# performed using the COCO evaluation server).
self._do_evaluation = "annotations" in self._coco_api.dataset
if self._do_evaluation:
self._kpt_oks_sigmas = kpt_oks_sigmas
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
"""
Args:
inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
It is a list of dict. Each dict corresponds to an image and
contains keys like "height", "width", "file_name", "image_id".
outputs: the outputs of a COCO model. It is a list of dicts with key
"instances" that contains :class:`Instances`.
"""
for input, output in zip(inputs, outputs):
prediction = {"image_id": input["image_id"]}
if "instances" in output:
instances = output["instances"].to(self._cpu_device)
prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
if "proposals" in output:
prediction["proposals"] = output["proposals"].to(self._cpu_device)
if len(prediction) > 1:
self._predictions.append(prediction)
def evaluate(self, img_ids=None):
"""
Args:
img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
"""
if self._distributed:
comm.synchronize()
predictions = comm.gather(self._predictions, dst=0)
predictions = list(itertools.chain(*predictions))
if not comm.is_main_process():
return {}
else:
predictions = self._predictions
if len(predictions) == 0:
self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
return {}
if self._output_dir:
PathManager.mkdirs(self._output_dir)
file_path = os.path.join(self._output_dir, "instances_predictions.pth")
with PathManager.open(file_path, "wb") as f:
torch.save(predictions, f)
self._results = OrderedDict()
if "proposals" in predictions[0]:
self._eval_box_proposals(predictions)
if "instances" in predictions[0]:
self._eval_predictions(predictions, img_ids=img_ids)
# Copy so the caller can do whatever with results
return copy.deepcopy(self._results)
def _tasks_from_predictions(self, predictions):
"""
Get COCO API "tasks" (i.e. iou_type) from COCO-format predictions.
"""
tasks = {"bbox"}
for pred in predictions:
if "segmentation" in pred:
tasks.add("segm")
if "keypoints" in pred:
tasks.add("keypoints")
return sorted(tasks)
def _eval_predictions(self, predictions, img_ids=None):
"""
Evaluate predictions. Fill self._results with the metrics of the tasks.
"""
self._logger.info("Preparing results for COCO format ...")
coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
tasks = self._tasks or self._tasks_from_predictions(coco_results)
# unmap the category ids for COCO
if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
num_classes = len(all_contiguous_ids)
assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
for result in coco_results:
category_id = result["category_id"]
assert category_id < num_classes, (
f"A prediction has class={category_id}, "
f"but the dataset only has {num_classes} classes and "
f"predicted class id should be in [0, {num_classes - 1}]."
)
result["category_id"] = reverse_id_mapping[category_id]
if self._output_dir:
file_path = os.path.join(self._output_dir, "coco_instances_results.json")
self._logger.info("Saving results to {}".format(file_path))
with PathManager.open(file_path, "w") as f:
f.write(json.dumps(coco_results))
f.flush()
if not self._do_evaluation:
self._logger.info("Annotations are not available for evaluation.")
return
self._logger.info(
"Evaluating predictions with {} COCO API...".format(
"unofficial" if self._use_fast_impl else "official"
)
)
for task in sorted(tasks):
assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
coco_eval = (
_evaluate_predictions_on_coco(
self._coco_api,
coco_results,
task,
kpt_oks_sigmas=self._kpt_oks_sigmas,
cocoeval_fn=COCOeval_opt if self._use_fast_impl else COCOeval,
img_ids=img_ids,
max_dets_per_image=self._max_dets_per_image,
)
if len(coco_results) > 0
else None # cocoapi does not handle empty results very well
)
res = self._derive_coco_results(
coco_eval, task, class_names=self._metadata.get("thing_classes")
)
self._results[task] = res
def _eval_box_proposals(self, predictions):
"""
Evaluate the box proposals in predictions.
Fill self._results with the metrics for "box_proposals" task.
"""
if self._output_dir:
# Saving generated box proposals to file.
# Predicted box_proposals are in XYXY_ABS mode.
bbox_mode = BoxMode.XYXY_ABS.value
ids, boxes, objectness_logits = [], [], []
for prediction in predictions:
ids.append(prediction["image_id"])
boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
proposal_data = {
"boxes": boxes,
"objectness_logits": objectness_logits,
"ids": ids,
"bbox_mode": bbox_mode,
}
with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
pickle.dump(proposal_data, f)
if not self._do_evaluation:
self._logger.info("Annotations are not available for evaluation.")
return
self._logger.info("Evaluating bbox proposals ...")
res = {}
areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
for limit in [100, 1000]:
for area, suffix in areas.items():
stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit)
key = "AR{}@{:d}".format(suffix, limit)
res[key] = float(stats["ar"].item() * 100)
self._logger.info("Proposal metrics: \n" + create_small_table(res))
self._results["box_proposals"] = res
def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
"""
Derive the desired score numbers from summarized COCOeval.
Args:
coco_eval (None or COCOEval): None represents no predictions from model.
iou_type (str):
class_names (None or list[str]): if provided, will use it to compute
per-category AP.
Returns:
a dict of {metric name: score}
"""
metrics = {
"bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
"segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
"keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
}[iou_type]
if coco_eval is None:
self._logger.warn("No predictions from the model!")
return {metric: float("nan") for metric in metrics}
# the standard metrics
results = {
metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
for idx, metric in enumerate(metrics)
}
self._logger.info(
"Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
)
if not np.isfinite(sum(results.values())):
self._logger.info("Some metrics cannot be computed and is shown as NaN.")
if class_names is None or len(class_names) <= 1:
return results
# Compute per-category AP
# from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
precisions = coco_eval.eval["precision"]
# precision has dims (iou, recall, cls, area range, max dets)
assert len(class_names) == precisions.shape[2]
results_per_category = []
for idx, name in enumerate(class_names):
# area range index 0: all area ranges
# max dets index -1: typically 100 per image
precision = precisions[:, :, idx, 0, -1]
precision = precision[precision > -1]
ap = np.mean(precision) if precision.size else float("nan")
results_per_category.append(("{}".format(name), float(ap * 100)))
# tabulate it
N_COLS = min(6, len(results_per_category) * 2)
results_flatten = list(itertools.chain(*results_per_category))
results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
table = tabulate(
results_2d,
tablefmt="pipe",
floatfmt=".3f",
headers=["category", "AP"] * (N_COLS // 2),
numalign="left",
)
self._logger.info("Per-category {} AP: \n".format(iou_type) + table)
results.update({"AP-" + name: ap for name, ap in results_per_category})
return results
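# Illustrative only (numbers invented): with class_names given, the dict
# returned above looks like
#   {"AP": 40.1, "AP50": 60.2, "AP75": 43.0, "APs": 22.5, "APm": 44.0,
#    "APl": 52.7, "AP-person": 52.3, "AP-car": 61.8, ...}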
def instances_to_coco_json(instances, img_id):
"""
Dump an "Instances" object to a COCO-format json that's used for evaluation.
Args:
instances (Instances):
img_id (int): the image id
Returns:
list[dict]: list of json annotations in COCO format.
"""
num_instance = len(instances)
if num_instance == 0:
return []
boxes = instances.pred_boxes.tensor.numpy()
boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
boxes = boxes.tolist()
scores = instances.scores.tolist()
classes = instances.pred_classes.tolist()
has_mask = instances.has("pred_masks")
if has_mask:
# use RLE to encode the masks, because they are too large and takes memory
# since this evaluator stores outputs of the entire dataset
rles = [
mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
for mask in instances.pred_masks
]
for rle in rles:
# "counts" is an array encoded by mask_util as a byte-stream. Python3's
# json writer which always produces strings cannot serialize a bytestream
# unless you decode it. Thankfully, utf-8 works out (which is also what
# the pycocotools/_mask.pyx does).
rle["counts"] = rle["counts"].decode("utf-8")
has_keypoints = instances.has("pred_keypoints")
if has_keypoints:
keypoints = instances.pred_keypoints
results = []
for k in range(num_instance):
result = {
"image_id": img_id,
"category_id": classes[k],
"bbox": boxes[k],
"score": scores[k],
}
if has_mask:
result["segmentation"] = rles[k]
if has_keypoints:
# In COCO annotations,
# keypoints coordinates are pixel indices.
# However our predictions are floating point coordinates.
# Therefore we subtract 0.5 to be consistent with the annotation format.
# This is the inverse of data loading logic in `datasets/coco.py`.
keypoints[k][:, :2] -= 0.5
result["keypoints"] = keypoints[k].flatten().tolist()
results.append(result)
return results
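# Illustrative only (values invented): one element of the list returned by
# instances_to_coco_json has this shape, matching COCO's results format:
#
#   {
#       "image_id": 42,
#       "category_id": 3,                  # contiguous id; remapped later by the evaluator
#       "bbox": [10.0, 20.0, 50.0, 80.0],  # XYWH_ABS, after the conversion above
#       "score": 0.97,
#       "segmentation": {"size": [480, 640], "counts": "<utf-8 RLE>"},  # if masks exist
#       "keypoints": [x0, y0, v0, ...],    # if keypoints exist
#   }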
# inspired from Detectron:
# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None):
"""
Evaluate detection proposal recall metrics. This function is a much
faster alternative to the official COCO API recall evaluation code. However,
it produces slightly different results.
"""
# Record max overlap value for each gt box
# Return vector of overlap values
areas = {
"all": 0,
"small": 1,
"medium": 2,
"large": 3,
"96-128": 4,
"128-256": 5,
"256-512": 6,
"512-inf": 7,
}
area_ranges = [
[0**2, 1e5**2], # all
[0**2, 32**2], # small
[32**2, 96**2], # medium
[96**2, 1e5**2], # large
[96**2, 128**2], # 96-128
[128**2, 256**2], # 128-256
[256**2, 512**2], # 256-512
[512**2, 1e5**2],
] # 512-inf
assert area in areas, "Unknown area range: {}".format(area)
area_range = area_ranges[areas[area]]
gt_overlaps = []
num_pos = 0
for prediction_dict in dataset_predictions:
predictions = prediction_dict["proposals"]
# sort predictions in descending order
# TODO maybe remove this and make it explicit in the documentation
inds = predictions.objectness_logits.sort(descending=True)[1]
predictions = predictions[inds]
ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"])
anno = coco_api.loadAnns(ann_ids)
gt_boxes = [
BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
for obj in anno
if obj["iscrowd"] == 0
]
gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes
gt_boxes = Boxes(gt_boxes)
gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])
if len(gt_boxes) == 0 or len(predictions) == 0:
continue
valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
gt_boxes = gt_boxes[valid_gt_inds]
num_pos += len(gt_boxes)
if len(gt_boxes) == 0:
continue
if limit is not None and len(predictions) > limit:
predictions = predictions[:limit]
overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
_gt_overlaps = torch.zeros(len(gt_boxes))
for j in range(min(len(predictions), len(gt_boxes))):
# find which proposal box maximally covers each gt box
# and get the iou amount of coverage for each gt box
max_overlaps, argmax_overlaps = overlaps.max(dim=0)
# find which gt box is 'best' covered (i.e. 'best' = most iou)
gt_ovr, gt_ind = max_overlaps.max(dim=0)
assert gt_ovr >= 0
# find the proposal box that covers the best covered gt box
box_ind = argmax_overlaps[gt_ind]
# record the iou coverage of this gt box
_gt_overlaps[j] = overlaps[box_ind, gt_ind]
assert _gt_overlaps[j] == gt_ovr
# mark the proposal box and the gt box as used
overlaps[box_ind, :] = -1
overlaps[:, gt_ind] = -1
# append recorded iou coverage level
gt_overlaps.append(_gt_overlaps)
gt_overlaps = (
torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
)
gt_overlaps, _ = torch.sort(gt_overlaps)
if thresholds is None:
step = 0.05
thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
recalls = torch.zeros_like(thresholds)
# compute recall for each iou threshold
for i, t in enumerate(thresholds):
recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
# ar = 2 * np.trapz(recalls, thresholds)
ar = recalls.mean()
return {
"ar": ar,
"recalls": recalls,
"thresholds": thresholds,
"gt_overlaps": gt_overlaps,
"num_pos": num_pos,
}
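# In symbols (a restatement of the loop above, with t ranging over the IoU
# thresholds 0.50, 0.55, ..., 0.95):
#   recall(t) = |{gt : best_iou(gt) >= t}| / num_pos
# and the returned "ar" is the unweighted mean of recall(t) over the thresholds.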
def _evaluate_predictions_on_coco(
coco_gt,
coco_results,
iou_type,
kpt_oks_sigmas=None,
cocoeval_fn=COCOeval_opt,
img_ids=None,
max_dets_per_image=None,
):
"""
Evaluate the coco results using COCOEval API.
"""
assert len(coco_results) > 0
if iou_type == "segm":
coco_results = copy.deepcopy(coco_results)
# When evaluating mask AP, if the results contain bbox, cocoapi will
# use the box area as the area of the instance, instead of the mask area.
# This leads to a different definition of small/medium/large.
# We remove the bbox field to let mask AP use mask area.
for c in coco_results:
c.pop("bbox", None)
coco_dt = coco_gt.loadRes(coco_results)
coco_eval = cocoeval_fn(coco_gt, coco_dt, iou_type)
# For COCO, the default max_dets_per_image is [1, 10, 100].
if max_dets_per_image is None:
max_dets_per_image = [1, 10, 100] # Default from COCOEval
else:
assert (
len(max_dets_per_image) >= 3
), "COCOeval requires maxDets (and max_dets_per_image) to have length at least 3"
# In the case that user supplies a custom input for max_dets_per_image,
# apply COCOevalMaxDets to evaluate AP with the custom input.
if max_dets_per_image[2] != 100:
coco_eval = COCOevalMaxDets(coco_gt, coco_dt, iou_type)
if iou_type != "keypoints":
coco_eval.params.maxDets = max_dets_per_image
if img_ids is not None:
coco_eval.params.imgIds = img_ids
if iou_type == "keypoints":
# Use the COCO default keypoint OKS sigmas unless overrides are specified
if kpt_oks_sigmas:
assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "pycocotools is too old!"
coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas)
# COCOAPI requires every detection and every gt to have keypoints, so
# we just take the first entry from both
num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3
num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3
num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas)
assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, (
f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. "
f"Ground truth contains {num_keypoints_gt} keypoints. "
f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. "
"They have to agree with each other. For meaning of OKS, please refer to "
"http://cocodataset.org/#keypoints-eval."
)
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
return coco_eval
class COCOevalMaxDets(COCOeval):
"""
Modified version of COCOeval for evaluating AP with a custom
maxDets (by default for COCO, maxDets is 100)
"""
def summarize(self):
"""
Compute and display summary metrics for evaluation results given
a custom value for max_dets_per_image
"""
def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
p = self.params
iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
titleStr = "Average Precision" if ap == 1 else "Average Recall"
typeStr = "(AP)" if ap == 1 else "(AR)"
iouStr = (
"{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
if iouThr is None
else "{:0.2f}".format(iouThr)
)
aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
if ap == 1:
# dimension of precision: [TxRxKxAxM]
s = self.eval["precision"]
# IoU
if iouThr is not None:
t = np.where(iouThr == p.iouThrs)[0]
s = s[t]
s = s[:, :, :, aind, mind]
else:
# dimension of recall: [TxKxAxM]
s = self.eval["recall"]
if iouThr is not None:
t = np.where(iouThr == p.iouThrs)[0]
s = s[t]
s = s[:, :, aind, mind]
if len(s[s > -1]) == 0:
mean_s = -1
else:
mean_s = np.mean(s[s > -1])
print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
return mean_s
def _summarizeDets():
stats = np.zeros((12,))
# Evaluate AP using the custom limit on maximum detections per image
stats[0] = _summarize(1, maxDets=self.params.maxDets[2])
stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2])
stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2])
stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2])
stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2])
stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2])
stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2])
return stats
def _summarizeKps():
stats = np.zeros((10,))
stats[0] = _summarize(1, maxDets=20)
stats[1] = _summarize(1, maxDets=20, iouThr=0.5)
stats[2] = _summarize(1, maxDets=20, iouThr=0.75)
stats[3] = _summarize(1, maxDets=20, areaRng="medium")
stats[4] = _summarize(1, maxDets=20, areaRng="large")
stats[5] = _summarize(0, maxDets=20)
stats[6] = _summarize(0, maxDets=20, iouThr=0.5)
stats[7] = _summarize(0, maxDets=20, iouThr=0.75)
stats[8] = _summarize(0, maxDets=20, areaRng="medium")
stats[9] = _summarize(0, maxDets=20, areaRng="large")
return stats
if not self.eval:
raise Exception("Please run accumulate() first")
iouType = self.params.iouType
if iouType == "segm" or iouType == "bbox":
summarize = _summarizeDets
elif iouType == "keypoints":
summarize = _summarizeKps
self.stats = summarize()
def __str__(self):
self.summarize()
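# --- Added example (not part of the original file) ---
# A minimal sketch of evaluating with a custom detection limit, assuming a
# registered COCO-format dataset "my_coco_val". Passing max_dets_per_image
# other than 100 routes evaluation through COCOevalMaxDets above.
def _example_coco_eval_max_dets(cfg, model):  # hypothetical helper, illustration only
    from detectron2.data import build_detection_test_loader
    from detectron2.evaluation import COCOEvaluator, inference_on_dataset

    evaluator = COCOEvaluator("my_coco_val", output_dir="./output", max_dets_per_image=500)
    loader = build_detection_test_loader(cfg, "my_coco_val")
    # Returns e.g. {"bbox": {"AP": ..., "AP50": ..., ...}} computed with maxDets=500.
    return inference_on_dataset(model, loader, evaluator)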
# Copyright (c) Facebook, Inc. and its affiliates.
import datetime
import logging
import time
from collections import OrderedDict, abc
from contextlib import ExitStack, contextmanager
from typing import List, Union
import torch
from torch import nn
from detectron2.utils.comm import get_world_size, is_main_process
from detectron2.utils.logger import log_every_n_seconds
class DatasetEvaluator:
"""
Base class for a dataset evaluator.
The function :func:`inference_on_dataset` runs the model over
all samples in the dataset, and has a DatasetEvaluator process the inputs/outputs.
This class will accumulate information of the inputs/outputs (by :meth:`process`),
and produce evaluation results in the end (by :meth:`evaluate`).
"""
def reset(self):
"""
Preparation for a new round of evaluation.
Should be called before starting a round of evaluation.
"""
pass
def process(self, inputs, outputs):
"""
Process the pair of inputs and outputs.
If they contain batches, the pairs can be consumed one-by-one using `zip`:
.. code-block:: python
for input_, output in zip(inputs, outputs):
# do evaluation on single input/output pair
...
Args:
inputs (list): the inputs that's used to call the model.
outputs (list): the return value of `model(inputs)`
"""
pass
def evaluate(self):
"""
Evaluate/summarize the performance, after processing all input/output pairs.
Returns:
dict:
A new evaluator class can return a dict of arbitrary format
as long as the user can process the results.
In our train_net.py, we expect the following format:
* key: the name of the task (e.g., bbox)
* value: a dict of {metric name: score}, e.g.: {"AP50": 80}
"""
pass
class DatasetEvaluators(DatasetEvaluator):
"""
Wrapper class to combine multiple :class:`DatasetEvaluator` instances.
This class dispatches every evaluation call to
all of its :class:`DatasetEvaluator`.
"""
def __init__(self, evaluators):
"""
Args:
evaluators (list): the evaluators to combine.
"""
super().__init__()
self._evaluators = evaluators
def reset(self):
for evaluator in self._evaluators:
evaluator.reset()
def process(self, inputs, outputs):
for evaluator in self._evaluators:
evaluator.process(inputs, outputs)
def evaluate(self):
results = OrderedDict()
for evaluator in self._evaluators:
result = evaluator.evaluate()
if is_main_process() and result is not None:
for k, v in result.items():
assert (
k not in results
), "Different evaluators produce results with the same key {}".format(k)
results[k] = v
return results
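# --- Added example (not part of the original file) ---
# A hedged sketch of combining evaluators so a single inference pass feeds all
# of them; the names below are illustrative, not defined in this file.
#
#   evaluator = DatasetEvaluators([
#       COCOEvaluator("my_dataset_val", output_dir="./output"),
#       MyProposalStatsEvaluator(),  # hypothetical user-defined DatasetEvaluator
#   ])
#   results = inference_on_dataset(model, val_loader, evaluator)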
def inference_on_dataset(
model,
data_loader,
evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None],
callbacks=None,
):
"""
Run model on the data_loader and evaluate the metrics with evaluator.
Also benchmark the inference speed of `model.__call__` accurately.
The model will be used in eval mode.
Args:
model (callable): a callable which takes an object from
`data_loader` and returns some outputs.
If it's an nn.Module, it will be temporarily set to `eval` mode.
If you wish to evaluate a model in `training` mode instead, you can
wrap the given model and override its behavior of `.eval()` and `.train()`.
data_loader: an iterable object with a length.
The elements it generates will be the inputs to the model.
evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark,
but don't want to do any evaluation.
callbacks (dict of callables): a dictionary of callback functions which can be
called at each stage of inference.
Returns:
The return value of `evaluator.evaluate()`
"""
num_devices = get_world_size()
logger = logging.getLogger(__name__)
logger.info("Start inference on {} batches".format(len(data_loader)))
total = len(data_loader) # inference data loader must have a fixed length
if evaluator is None:
# create a no-op evaluator
evaluator = DatasetEvaluators([])
if isinstance(evaluator, abc.MutableSequence):
evaluator = DatasetEvaluators(evaluator)
evaluator.reset()
num_warmup = min(5, total - 1)
start_time = time.perf_counter()
total_data_time = 0
total_compute_time = 0
total_eval_time = 0
with ExitStack() as stack:
if isinstance(model, nn.Module):
stack.enter_context(inference_context(model))
stack.enter_context(torch.no_grad())
start_data_time = time.perf_counter()
dict.get(callbacks or {}, "on_start", lambda: None)()
for idx, inputs in enumerate(data_loader):
total_data_time += time.perf_counter() - start_data_time
if idx == num_warmup:
start_time = time.perf_counter()
total_data_time = 0
total_compute_time = 0
total_eval_time = 0
start_compute_time = time.perf_counter()
dict.get(callbacks or {}, "before_inference", lambda: None)()
outputs = model(inputs)
dict.get(callbacks or {}, "after_inference", lambda: None)()
if torch.cuda.is_available():
torch.cuda.synchronize()
total_compute_time += time.perf_counter() - start_compute_time
start_eval_time = time.perf_counter()
evaluator.process(inputs, outputs)
total_eval_time += time.perf_counter() - start_eval_time
iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
data_seconds_per_iter = total_data_time / iters_after_start
compute_seconds_per_iter = total_compute_time / iters_after_start
eval_seconds_per_iter = total_eval_time / iters_after_start
total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start
if idx >= num_warmup * 2 or compute_seconds_per_iter > 5:
eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1)))
log_every_n_seconds(
logging.INFO,
(
f"Inference done {idx + 1}/{total}. "
f"Dataloading: {data_seconds_per_iter:.4f} s/iter. "
f"Inference: {compute_seconds_per_iter:.4f} s/iter. "
f"Eval: {eval_seconds_per_iter:.4f} s/iter. "
f"Total: {total_seconds_per_iter:.4f} s/iter. "
f"ETA={eta}"
),
n=5,
)
start_data_time = time.perf_counter()
dict.get(callbacks or {}, "on_end", lambda: None)()
# Measure the time only for this worker (before the synchronization barrier)
total_time = time.perf_counter() - start_time
total_time_str = str(datetime.timedelta(seconds=total_time))
# NOTE this format is parsed by grep
logger.info(
"Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format(
total_time_str, total_time / (total - num_warmup), num_devices
)
)
total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
logger.info(
"Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format(
total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
)
)
results = evaluator.evaluate()
# An evaluator may return None when not in main process.
# Replace it by an empty dict instead to make it easier for downstream code to handle
if results is None:
results = {}
return results
@contextmanager
def inference_context(model):
"""
A context where the model is temporarily changed to eval mode,
and restored to previous mode afterwards.
Args:
model: a torch Module
"""
training_mode = model.training
model.eval()
yield
model.train(training_mode)
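# --- Added example (not part of the original file) ---
# A minimal sketch of using inference_context for a manual evaluation loop;
# the model's previous training/eval mode is restored when the block exits.
#
#   with inference_context(model), torch.no_grad():
#       outputs = model(inputs)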
# Copyright (c) Facebook, Inc. and its affiliates.
import copy
import logging
import numpy as np
import time
from pycocotools.cocoeval import COCOeval
from detectron2 import _C
logger = logging.getLogger(__name__)
class COCOeval_opt(COCOeval):
"""
This is a slightly modified version of the original COCO API, where the functions evaluateImg()
and accumulate() are implemented in C++ to speed up evaluation
"""
def evaluate(self):
"""
Run per image evaluation on given images and store results in self.evalImgs_cpp, a
data structure that isn't readable from Python but is used by a C++ implementation of
accumulate(). Unlike the original COCO PythonAPI, we don't populate the data structure
self.evalImgs because it is a computational bottleneck.
:return: None
"""
tic = time.time()
p = self.params
# add backward compatibility if useSegm is specified in params
if p.useSegm is not None:
p.iouType = "segm" if p.useSegm == 1 else "bbox"
logger.info("Evaluate annotation type *{}*".format(p.iouType))
p.imgIds = list(np.unique(p.imgIds))
if p.useCats:
p.catIds = list(np.unique(p.catIds))
p.maxDets = sorted(p.maxDets)
self.params = p
self._prepare() # bottleneck
# loop through images, area range, max detection number
catIds = p.catIds if p.useCats else [-1]
if p.iouType == "segm" or p.iouType == "bbox":
computeIoU = self.computeIoU
elif p.iouType == "keypoints":
computeIoU = self.computeOks
self.ious = {
(imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds
} # bottleneck
maxDet = p.maxDets[-1]
# <<<< Beginning of code differences with original COCO API
def convert_instances_to_cpp(instances, is_det=False):
# Convert annotations for a list of instances in an image to a format that's fast
# to access in C++
instances_cpp = []
for instance in instances:
instance_cpp = _C.InstanceAnnotation(
int(instance["id"]),
instance["score"] if is_det else instance.get("score", 0.0),
instance["area"],
bool(instance.get("iscrowd", 0)),
bool(instance.get("ignore", 0)),
)
instances_cpp.append(instance_cpp)
return instances_cpp
# Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
ground_truth_instances = [
[convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
for imgId in p.imgIds
]
detected_instances = [
[convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) for catId in p.catIds]
for imgId in p.imgIds
]
ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
if not p.useCats:
# For each image, flatten per-category lists into a single list
ground_truth_instances = [[[o for c in i for o in c]] for i in ground_truth_instances]
detected_instances = [[[o for c in i for o in c]] for i in detected_instances]
# Call C++ implementation of self.evaluateImgs()
self._evalImgs_cpp = _C.COCOevalEvaluateImages(
p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances
)
self._evalImgs = None
self._paramsEval = copy.deepcopy(self.params)
toc = time.time()
logger.info("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic))
# >>>> End of code differences with original COCO API
def accumulate(self):
"""
Accumulate per image evaluation results and store the result in self.eval. Does not
support changing parameter settings from those used by self.evaluate()
"""
logger.info("Accumulating evaluation results...")
tic = time.time()
assert hasattr(
self, "_evalImgs_cpp"
), "evaluate() must be called before accmulate() is called."
self.eval = _C.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)
# recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
self.eval["recall"] = np.array(self.eval["recall"]).reshape(
self.eval["counts"][:1] + self.eval["counts"][2:]
)
# precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
# num_area_ranges X num_max_detections
self.eval["precision"] = np.array(self.eval["precision"]).reshape(self.eval["counts"])
self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])
toc = time.time()
logger.info("COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic))
# Copyright (c) Facebook, Inc. and its affiliates.
import copy
import itertools
import json
import logging
import os
import pickle
from collections import OrderedDict
import torch
import detectron2.utils.comm as comm
from detectron2.config import CfgNode
from detectron2.data import MetadataCatalog
from detectron2.structures import Boxes, BoxMode, pairwise_iou
from detectron2.utils.file_io import PathManager
from detectron2.utils.logger import create_small_table
from .coco_evaluation import instances_to_coco_json
from .evaluator import DatasetEvaluator
class LVISEvaluator(DatasetEvaluator):
"""
Evaluate object proposal and instance detection/segmentation outputs using
LVIS's metrics and evaluation API.
"""
def __init__(
self,
dataset_name,
tasks=None,
distributed=True,
output_dir=None,
*,
max_dets_per_image=None,
):
"""
Args:
dataset_name (str): name of the dataset to be evaluated.
It must have the following corresponding metadata:
"json_file": the path to the LVIS format annotation
tasks (tuple[str]): tasks that can be evaluated under the given
configuration. A task is one of "bbox", "segm".
By default, will infer this automatically from predictions.
distributed (bool): if True, will collect results from all ranks for evaluation.
Otherwise, will evaluate the results in the current process.
output_dir (str): optional, an output directory to dump results.
max_dets_per_image (None or int): limit on the maximum number of detections
per image when evaluating AP. The LVIS default for this limit is 300.
"""
from lvis import LVIS
self._logger = logging.getLogger(__name__)
if tasks is not None and isinstance(tasks, CfgNode):
self._logger.warning(
"LVISEvaluator instantiated using config; this is deprecated behavior."
" Please pass in explicit arguments instead."
)
self._tasks = None # Inferring it from predictions should be better
else:
self._tasks = tasks
self._distributed = distributed
self._output_dir = output_dir
self._max_dets_per_image = max_dets_per_image
self._cpu_device = torch.device("cpu")
self._metadata = MetadataCatalog.get(dataset_name)
json_file = PathManager.get_local_path(self._metadata.json_file)
self._lvis_api = LVIS(json_file)
# Test set json files do not contain annotations (evaluation must be
# performed using the LVIS evaluation server).
self._do_evaluation = len(self._lvis_api.get_ann_ids()) > 0
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
"""
Args:
inputs: the inputs to a LVIS model (e.g., GeneralizedRCNN).
It is a list of dict. Each dict corresponds to an image and
contains keys like "height", "width", "file_name", "image_id".
outputs: the outputs of a LVIS model. It is a list of dicts with key
"instances" that contains :class:`Instances`.
"""
for input, output in zip(inputs, outputs):
prediction = {"image_id": input["image_id"]}
if "instances" in output:
instances = output["instances"].to(self._cpu_device)
prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
if "proposals" in output:
prediction["proposals"] = output["proposals"].to(self._cpu_device)
self._predictions.append(prediction)
def evaluate(self):
if self._distributed:
comm.synchronize()
predictions = comm.gather(self._predictions, dst=0)
predictions = list(itertools.chain(*predictions))
if not comm.is_main_process():
return
else:
predictions = self._predictions
if len(predictions) == 0:
self._logger.warning("[LVISEvaluator] Did not receive valid predictions.")
return {}
if self._output_dir:
PathManager.mkdirs(self._output_dir)
file_path = os.path.join(self._output_dir, "instances_predictions.pth")
with PathManager.open(file_path, "wb") as f:
torch.save(predictions, f)
self._results = OrderedDict()
if "proposals" in predictions[0]:
self._eval_box_proposals(predictions)
if "instances" in predictions[0]:
self._eval_predictions(predictions)
# Copy so the caller can do whatever with results
return copy.deepcopy(self._results)
def _tasks_from_predictions(self, predictions):
for pred in predictions:
if "segmentation" in pred:
return ("bbox", "segm")
return ("bbox",)
def _eval_predictions(self, predictions):
"""
Evaluate predictions. Fill self._results with the metrics of the tasks.
Args:
predictions (list[dict]): list of outputs from the model
"""
self._logger.info("Preparing results in the LVIS format ...")
lvis_results = list(itertools.chain(*[x["instances"] for x in predictions]))
tasks = self._tasks or self._tasks_from_predictions(lvis_results)
# LVIS evaluator can be used to evaluate results for COCO dataset categories.
# In this case `_metadata` variable will have a field with COCO-specific category mapping.
if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
reverse_id_mapping = {
v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
}
for result in lvis_results:
result["category_id"] = reverse_id_mapping[result["category_id"]]
else:
# unmap the category ids for LVIS (from 0-indexed to 1-indexed)
for result in lvis_results:
result["category_id"] += 1
if self._output_dir:
file_path = os.path.join(self._output_dir, "lvis_instances_results.json")
self._logger.info("Saving results to {}".format(file_path))
with PathManager.open(file_path, "w") as f:
f.write(json.dumps(lvis_results))
f.flush()
if not self._do_evaluation:
self._logger.info("Annotations are not available for evaluation.")
return
self._logger.info("Evaluating predictions ...")
for task in sorted(tasks):
res = _evaluate_predictions_on_lvis(
self._lvis_api,
lvis_results,
task,
max_dets_per_image=self._max_dets_per_image,
class_names=self._metadata.get("thing_classes"),
)
self._results[task] = res
def _eval_box_proposals(self, predictions):
"""
Evaluate the box proposals in predictions.
Fill self._results with the metrics for "box_proposals" task.
"""
if self._output_dir:
# Saving generated box proposals to file.
# Predicted box_proposals are in XYXY_ABS mode.
bbox_mode = BoxMode.XYXY_ABS.value
ids, boxes, objectness_logits = [], [], []
for prediction in predictions:
ids.append(prediction["image_id"])
boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
proposal_data = {
"boxes": boxes,
"objectness_logits": objectness_logits,
"ids": ids,
"bbox_mode": bbox_mode,
}
with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
pickle.dump(proposal_data, f)
if not self._do_evaluation:
self._logger.info("Annotations are not available for evaluation.")
return
self._logger.info("Evaluating bbox proposals ...")
res = {}
areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
for limit in [100, 1000]:
for area, suffix in areas.items():
stats = _evaluate_box_proposals(predictions, self._lvis_api, area=area, limit=limit)
key = "AR{}@{:d}".format(suffix, limit)
res[key] = float(stats["ar"].item() * 100)
self._logger.info("Proposal metrics: \n" + create_small_table(res))
self._results["box_proposals"] = res
# inspired from Detectron:
# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
def _evaluate_box_proposals(dataset_predictions, lvis_api, thresholds=None, area="all", limit=None):
"""
Evaluate detection proposal recall metrics. This function is a much
faster alternative to the official LVIS API recall evaluation code. However,
it produces slightly different results.
"""
# Record max overlap value for each gt box
# Return vector of overlap values
areas = {
"all": 0,
"small": 1,
"medium": 2,
"large": 3,
"96-128": 4,
"128-256": 5,
"256-512": 6,
"512-inf": 7,
}
area_ranges = [
[0**2, 1e5**2], # all
[0**2, 32**2], # small
[32**2, 96**2], # medium
[96**2, 1e5**2], # large
[96**2, 128**2], # 96-128
[128**2, 256**2], # 128-256
[256**2, 512**2], # 256-512
[512**2, 1e5**2],
] # 512-inf
assert area in areas, "Unknown area range: {}".format(area)
area_range = area_ranges[areas[area]]
gt_overlaps = []
num_pos = 0
for prediction_dict in dataset_predictions:
predictions = prediction_dict["proposals"]
# sort predictions in descending order
# TODO maybe remove this and make it explicit in the documentation
inds = predictions.objectness_logits.sort(descending=True)[1]
predictions = predictions[inds]
ann_ids = lvis_api.get_ann_ids(img_ids=[prediction_dict["image_id"]])
anno = lvis_api.load_anns(ann_ids)
gt_boxes = [
BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno
]
gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes
gt_boxes = Boxes(gt_boxes)
gt_areas = torch.as_tensor([obj["area"] for obj in anno])
if len(gt_boxes) == 0 or len(predictions) == 0:
continue
valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
gt_boxes = gt_boxes[valid_gt_inds]
num_pos += len(gt_boxes)
if len(gt_boxes) == 0:
continue
if limit is not None and len(predictions) > limit:
predictions = predictions[:limit]
overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
_gt_overlaps = torch.zeros(len(gt_boxes))
for j in range(min(len(predictions), len(gt_boxes))):
# find which proposal box maximally covers each gt box
# and get the iou amount of coverage for each gt box
max_overlaps, argmax_overlaps = overlaps.max(dim=0)
# find which gt box is 'best' covered (i.e. 'best' = most iou)
gt_ovr, gt_ind = max_overlaps.max(dim=0)
assert gt_ovr >= 0
# find the proposal box that covers the best covered gt box
box_ind = argmax_overlaps[gt_ind]
# record the iou coverage of this gt box
_gt_overlaps[j] = overlaps[box_ind, gt_ind]
assert _gt_overlaps[j] == gt_ovr
# mark the proposal box and the gt box as used
overlaps[box_ind, :] = -1
overlaps[:, gt_ind] = -1
# append recorded iou coverage level
gt_overlaps.append(_gt_overlaps)
gt_overlaps = (
torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
)
gt_overlaps, _ = torch.sort(gt_overlaps)
if thresholds is None:
step = 0.05
thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
recalls = torch.zeros_like(thresholds)
# compute recall for each iou threshold
for i, t in enumerate(thresholds):
recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
# ar = 2 * np.trapz(recalls, thresholds)
ar = recalls.mean()
return {
"ar": ar,
"recalls": recalls,
"thresholds": thresholds,
"gt_overlaps": gt_overlaps,
"num_pos": num_pos,
}
def _evaluate_predictions_on_lvis(
lvis_gt, lvis_results, iou_type, max_dets_per_image=None, class_names=None
):
"""
Args:
iou_type (str):
max_dets_per_image (None or int): limit on the maximum number of detections
per image when evaluating AP. The LVIS default for this limit is 300.
class_names (None or list[str]): if provided, will use it to compute
per-category AP.
Returns:
a dict of {metric name: score}
"""
metrics = {
"bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"],
"segm": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"],
}[iou_type]
logger = logging.getLogger(__name__)
if len(lvis_results) == 0: # TODO: check if needed
logger.warn("No predictions from the model!")
return {metric: float("nan") for metric in metrics}
if iou_type == "segm":
lvis_results = copy.deepcopy(lvis_results)
# When evaluating mask AP, if the results contain bbox, LVIS API will
# use the box area as the area of the instance, instead of the mask area.
# This leads to a different definition of small/medium/large.
# We remove the bbox field to let mask AP use mask area.
for c in lvis_results:
c.pop("bbox", None)
if max_dets_per_image is None:
max_dets_per_image = 300 # Default for LVIS dataset
from lvis import LVISEval, LVISResults
logger.info(f"Evaluating with max detections per image = {max_dets_per_image}")
lvis_results = LVISResults(lvis_gt, lvis_results, max_dets=max_dets_per_image)
lvis_eval = LVISEval(lvis_gt, lvis_results, iou_type)
lvis_eval.run()
lvis_eval.print_results()
# Pull the standard metrics from the LVIS results
results = lvis_eval.get_results()
results = {metric: float(results[metric] * 100) for metric in metrics}
logger.info("Evaluation results for {}: \n".format(iou_type) + create_small_table(results))
return results
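# --- Added example (not part of the original file) ---
# A minimal sketch of driving LVISEvaluator, assuming a registered LVIS-format
# dataset "my_lvis_val" whose metadata carries "json_file".
def _example_lvis_eval(cfg, model):  # hypothetical helper, illustration only
    from detectron2.data import build_detection_test_loader
    from detectron2.evaluation import LVISEvaluator, inference_on_dataset

    evaluator = LVISEvaluator("my_lvis_val", output_dir="./output")
    loader = build_detection_test_loader(cfg, "my_lvis_val")
    # Returns e.g. {"bbox": {"AP": ..., "APr": ..., "APc": ..., "APf": ...}, ...}.
    return inference_on_dataset(model, loader, evaluator)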
# Copyright (c) Facebook, Inc. and its affiliates.
import contextlib
import io
import itertools
import json
import logging
import numpy as np
import os
import tempfile
from collections import OrderedDict
from typing import Optional
from PIL import Image
from tabulate import tabulate
from detectron2.data import MetadataCatalog
from detectron2.utils import comm
from detectron2.utils.file_io import PathManager
from .evaluator import DatasetEvaluator
logger = logging.getLogger(__name__)
class COCOPanopticEvaluator(DatasetEvaluator):
"""
Evaluate Panoptic Quality metrics on COCO using PanopticAPI.
It saves panoptic segmentation prediction in `output_dir`
It contains a synchronize call and has to be called from all workers.
"""
def __init__(self, dataset_name: str, output_dir: Optional[str] = None):
"""
Args:
dataset_name: name of the dataset
output_dir: output directory to save results for evaluation.
"""
self._metadata = MetadataCatalog.get(dataset_name)
self._thing_contiguous_id_to_dataset_id = {
v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
}
self._stuff_contiguous_id_to_dataset_id = {
v: k for k, v in self._metadata.stuff_dataset_id_to_contiguous_id.items()
}
self._output_dir = output_dir
if self._output_dir is not None:
PathManager.mkdirs(self._output_dir)
def reset(self):
self._predictions = []
def _convert_category_id(self, segment_info):
isthing = segment_info.pop("isthing", None)
if isthing is None:
# the model produces panoptic category id directly. No more conversion needed
return segment_info
if isthing is True:
segment_info["category_id"] = self._thing_contiguous_id_to_dataset_id[
segment_info["category_id"]
]
else:
segment_info["category_id"] = self._stuff_contiguous_id_to_dataset_id[
segment_info["category_id"]
]
return segment_info
def process(self, inputs, outputs):
from panopticapi.utils import id2rgb
for input, output in zip(inputs, outputs):
panoptic_img, segments_info = output["panoptic_seg"]
panoptic_img = panoptic_img.cpu().numpy()
if segments_info is None:
# If "segments_info" is None, we assume "panoptic_img" is a
# H*W int32 image storing the panoptic_id in the format of
# category_id * label_divisor + instance_id. We reserve -1 for
# VOID label, and add 1 to panoptic_img since the official
# evaluation script uses 0 for VOID label.
label_divisor = self._metadata.label_divisor
segments_info = []
for panoptic_label in np.unique(panoptic_img):
if panoptic_label == -1:
# VOID region.
continue
pred_class = panoptic_label // label_divisor
isthing = (
pred_class in self._metadata.thing_dataset_id_to_contiguous_id.values()
)
segments_info.append(
{
"id": int(panoptic_label) + 1,
"category_id": int(pred_class),
"isthing": bool(isthing),
}
)
# Official evaluation script uses 0 for VOID label.
panoptic_img += 1
file_name = os.path.basename(input["file_name"])
file_name_png = os.path.splitext(file_name)[0] + ".png"
with io.BytesIO() as out:
Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG")
segments_info = [self._convert_category_id(x) for x in segments_info]
self._predictions.append(
{
"image_id": input["image_id"],
"file_name": file_name_png,
"png_string": out.getvalue(),
"segments_info": segments_info,
}
)
def evaluate(self):
comm.synchronize()
self._predictions = comm.gather(self._predictions)
self._predictions = list(itertools.chain(*self._predictions))
if not comm.is_main_process():
return
# PanopticApi requires local files
gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
gt_folder = PathManager.get_local_path(self._metadata.panoptic_root)
with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
logger.info("Writing all panoptic predictions to {} ...".format(pred_dir))
for p in self._predictions:
with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
f.write(p.pop("png_string"))
with open(gt_json, "r") as f:
json_data = json.load(f)
json_data["annotations"] = self._predictions
output_dir = self._output_dir or pred_dir
predictions_json = os.path.join(output_dir, "predictions.json")
with PathManager.open(predictions_json, "w") as f:
f.write(json.dumps(json_data))
from panopticapi.evaluation import pq_compute
with contextlib.redirect_stdout(io.StringIO()):
pq_res = pq_compute(
gt_json,
PathManager.get_local_path(predictions_json),
gt_folder=gt_folder,
pred_folder=pred_dir,
)
res = {}
res["PQ"] = 100 * pq_res["All"]["pq"]
res["SQ"] = 100 * pq_res["All"]["sq"]
res["RQ"] = 100 * pq_res["All"]["rq"]
res["PQ_th"] = 100 * pq_res["Things"]["pq"]
res["SQ_th"] = 100 * pq_res["Things"]["sq"]
res["RQ_th"] = 100 * pq_res["Things"]["rq"]
res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]
results = OrderedDict({"panoptic_seg": res})
_print_panoptic_results(pq_res)
return results
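# A minimal sketch (made-up label_divisor and labels) of the panoptic_id format
# handled in process() above: panoptic_id = category_id * label_divisor +
# instance_id, with -1 reserved for VOID.
def _example_decode_panoptic_id():
    label_divisor = 1000
    panoptic_img = np.array([[-1, 2003], [2003, 5001]])
    decoded = []
    for panoptic_label in np.unique(panoptic_img):
        if panoptic_label == -1:
            continue  # VOID region
        decoded.append((panoptic_label // label_divisor, panoptic_label % label_divisor))
    assert decoded == [(2, 3), (5, 1)]  # (category_id, instance_id) pairs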
def _print_panoptic_results(pq_res):
headers = ["", "PQ", "SQ", "RQ", "#categories"]
data = []
for name in ["All", "Things", "Stuff"]:
row = [name] + [pq_res[name][k] * 100 for k in ["pq", "sq", "rq"]] + [pq_res[name]["n"]]
data.append(row)
table = tabulate(
data, headers=headers, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center"
)
logger.info("Panoptic Evaluation Results:\n" + table)
if __name__ == "__main__":
from detectron2.utils.logger import setup_logger
logger = setup_logger()
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--gt-json")
parser.add_argument("--gt-dir")
parser.add_argument("--pred-json")
parser.add_argument("--pred-dir")
args = parser.parse_args()
from panopticapi.evaluation import pq_compute
with contextlib.redirect_stdout(io.StringIO()):
pq_res = pq_compute(
args.gt_json, args.pred_json, gt_folder=args.gt_dir, pred_folder=args.pred_dir
)
_print_panoptic_results(pq_res)
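    # Example invocation (hypothetical paths; module name assumed from the
    # upstream detectron2 layout):
    #   python -m detectron2.evaluation.panoptic_evaluation \
    #       --gt-json panoptic_val2017.json --gt-dir panoptic_val2017 \
    #       --pred-json predictions.json --pred-dir pred_pngs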
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
import os
import tempfile
import xml.etree.ElementTree as ET
from collections import OrderedDict, defaultdict
from functools import lru_cache
import torch
from detectron2.data import MetadataCatalog
from detectron2.utils import comm
from detectron2.utils.file_io import PathManager
from .evaluator import DatasetEvaluator
class PascalVOCDetectionEvaluator(DatasetEvaluator):
"""
Evaluate Pascal VOC style AP for Pascal VOC dataset.
It contains a synchronization, therefore has to be called from all ranks.
Note that the concept of AP can be implemented in different ways and may not
produce identical results. This class mimics the implementation of the official
Pascal VOC Matlab API, and should produce similar but not identical results to the
official API.
"""
def __init__(self, dataset_name):
"""
Args:
dataset_name (str): name of the dataset, e.g., "voc_2007_test"
"""
self._dataset_name = dataset_name
meta = MetadataCatalog.get(dataset_name)
# Too many tiny files, download all to local for speed.
annotation_dir_local = PathManager.get_local_path(
os.path.join(meta.dirname, "Annotations/")
)
self._anno_file_template = os.path.join(annotation_dir_local, "{}.xml")
self._image_set_path = os.path.join(meta.dirname, "ImageSets", "Main", meta.split + ".txt")
self._class_names = meta.thing_classes
assert meta.year in [2007, 2012], meta.year
self._is_2007 = meta.year == 2007
self._cpu_device = torch.device("cpu")
self._logger = logging.getLogger(__name__)
def reset(self):
self._predictions = defaultdict(list) # class name -> list of prediction strings
def process(self, inputs, outputs):
for input, output in zip(inputs, outputs):
image_id = input["image_id"]
instances = output["instances"].to(self._cpu_device)
boxes = instances.pred_boxes.tensor.numpy()
scores = instances.scores.tolist()
classes = instances.pred_classes.tolist()
for box, score, cls in zip(boxes, scores, classes):
xmin, ymin, xmax, ymax = box
# The inverse of data loading logic in `datasets/pascal_voc.py`
xmin += 1
ymin += 1
self._predictions[cls].append(
f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}"
)
def evaluate(self):
"""
Returns:
            dict: has a key "bbox", whose value is a dict of "AP", "AP50", and "AP75".
"""
all_predictions = comm.gather(self._predictions, dst=0)
if not comm.is_main_process():
return
predictions = defaultdict(list)
for predictions_per_rank in all_predictions:
for clsid, lines in predictions_per_rank.items():
predictions[clsid].extend(lines)
del all_predictions
self._logger.info(
"Evaluating {} using {} metric. "
"Note that results do not use the official Matlab API.".format(
self._dataset_name, 2007 if self._is_2007 else 2012
)
)
with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname:
res_file_template = os.path.join(dirname, "{}.txt")
aps = defaultdict(list) # iou -> ap per class
for cls_id, cls_name in enumerate(self._class_names):
lines = predictions.get(cls_id, [""])
with open(res_file_template.format(cls_name), "w") as f:
f.write("\n".join(lines))
for thresh in range(50, 100, 5):
rec, prec, ap = voc_eval(
res_file_template,
self._anno_file_template,
self._image_set_path,
cls_name,
ovthresh=thresh / 100.0,
use_07_metric=self._is_2007,
)
aps[thresh].append(ap * 100)
ret = OrderedDict()
mAP = {iou: np.mean(x) for iou, x in aps.items()}
ret["bbox"] = {"AP": np.mean(list(mAP.values())), "AP50": mAP[50], "AP75": mAP[75]}
return ret
##############################################################################
#
# Below code is modified from
# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py
# --------------------------------------------------------
# Fast/er R-CNN
# Licensed under The MIT License [see LICENSE for details]
# Written by Bharath Hariharan
# --------------------------------------------------------
"""Python implementation of the PASCAL VOC devkit's AP evaluation code."""
@lru_cache(maxsize=None)
def parse_rec(filename):
"""Parse a PASCAL VOC xml file."""
with PathManager.open(filename) as f:
tree = ET.parse(f)
objects = []
for obj in tree.findall("object"):
obj_struct = {}
obj_struct["name"] = obj.find("name").text
obj_struct["pose"] = obj.find("pose").text
obj_struct["truncated"] = int(obj.find("truncated").text)
obj_struct["difficult"] = int(obj.find("difficult").text)
bbox = obj.find("bndbox")
obj_struct["bbox"] = [
int(bbox.find("xmin").text),
int(bbox.find("ymin").text),
int(bbox.find("xmax").text),
int(bbox.find("ymax").text),
]
objects.append(obj_struct)
return objects
def voc_ap(rec, prec, use_07_metric=False):
"""Compute VOC AP given precision and recall. If use_07_metric is true, uses
    the VOC 07 11-point method (default: False).
"""
if use_07_metric:
# 11 point metric
ap = 0.0
for t in np.arange(0.0, 1.1, 0.1):
if np.sum(rec >= t) == 0:
p = 0
else:
p = np.max(prec[rec >= t])
ap = ap + p / 11.0
else:
# correct AP calculation
# first append sentinel values at the end
mrec = np.concatenate(([0.0], rec, [1.0]))
mpre = np.concatenate(([0.0], prec, [0.0]))
# compute the precision envelope
for i in range(mpre.size - 1, 0, -1):
mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
# to calculate area under PR curve, look for points
# where X axis (recall) changes value
i = np.where(mrec[1:] != mrec[:-1])[0]
# and sum (\Delta recall) * prec
ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
return ap
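# A quick sanity check (toy numbers) for voc_ap above: a detector that finds
# half the objects at precision 1.0 and the rest at precision 0.5 has an
# area-under-PR-curve AP of 0.5 * 1.0 + 0.5 * 0.5 = 0.75.
def _example_voc_ap():
    rec = np.array([0.5, 1.0])
    prec = np.array([1.0, 0.5])
    assert abs(voc_ap(rec, prec, use_07_metric=False) - 0.75) < 1e-6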
def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False):
"""rec, prec, ap = voc_eval(detpath,
annopath,
imagesetfile,
classname,
[ovthresh],
[use_07_metric])
Top level function that does the PASCAL VOC evaluation.
detpath: Path to detections
detpath.format(classname) should produce the detection results file.
annopath: Path to annotations
annopath.format(imagename) should be the xml annotations file.
imagesetfile: Text file containing the list of images, one image per line.
    classname: Category name
[ovthresh]: Overlap threshold (default = 0.5)
[use_07_metric]: Whether to use VOC07's 11 point AP computation
(default False)
"""
# assumes detections are in detpath.format(classname)
# assumes annotations are in annopath.format(imagename)
# assumes imagesetfile is a text file with each line an image name
# first load gt
# read list of images
with PathManager.open(imagesetfile, "r") as f:
lines = f.readlines()
imagenames = [x.strip() for x in lines]
# load annots
recs = {}
for imagename in imagenames:
recs[imagename] = parse_rec(annopath.format(imagename))
# extract gt objects for this class
class_recs = {}
npos = 0
for imagename in imagenames:
R = [obj for obj in recs[imagename] if obj["name"] == classname]
bbox = np.array([x["bbox"] for x in R])
difficult = np.array([x["difficult"] for x in R]).astype(bool)
# difficult = np.array([False for x in R]).astype(bool) # treat all "difficult" as GT
det = [False] * len(R)
npos = npos + sum(~difficult)
class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det}
# read dets
detfile = detpath.format(classname)
with open(detfile, "r") as f:
lines = f.readlines()
splitlines = [x.strip().split(" ") for x in lines]
image_ids = [x[0] for x in splitlines]
confidence = np.array([float(x[1]) for x in splitlines])
BB = np.array([[float(z) for z in x[2:]] for x in splitlines]).reshape(-1, 4)
# sort by confidence
sorted_ind = np.argsort(-confidence)
BB = BB[sorted_ind, :]
image_ids = [image_ids[x] for x in sorted_ind]
# go down dets and mark TPs and FPs
nd = len(image_ids)
tp = np.zeros(nd)
fp = np.zeros(nd)
for d in range(nd):
R = class_recs[image_ids[d]]
bb = BB[d, :].astype(float)
ovmax = -np.inf
BBGT = R["bbox"].astype(float)
if BBGT.size > 0:
# compute overlaps
# intersection
ixmin = np.maximum(BBGT[:, 0], bb[0])
iymin = np.maximum(BBGT[:, 1], bb[1])
ixmax = np.minimum(BBGT[:, 2], bb[2])
iymax = np.minimum(BBGT[:, 3], bb[3])
iw = np.maximum(ixmax - ixmin + 1.0, 0.0)
ih = np.maximum(iymax - iymin + 1.0, 0.0)
inters = iw * ih
# union
uni = (
(bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
+ (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0)
- inters
)
overlaps = inters / uni
ovmax = np.max(overlaps)
jmax = np.argmax(overlaps)
if ovmax > ovthresh:
if not R["difficult"][jmax]:
if not R["det"][jmax]:
tp[d] = 1.0
R["det"][jmax] = 1
else:
fp[d] = 1.0
else:
fp[d] = 1.0
# compute precision recall
fp = np.cumsum(fp)
tp = np.cumsum(tp)
rec = tp / float(npos)
# avoid divide by zero in case the first detection matches a difficult
# ground truth
prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
ap = voc_ap(rec, prec, use_07_metric)
return rec, prec, ap
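# A minimal sketch (toy boxes) of the legacy "+1" pixel convention used in the
# IoU computation of voc_eval above: a box [0, 0, 9, 9] covers 10x10 = 100
# pixels, so a 5x10 ground-truth box fully inside it yields IoU 50/100 = 0.5.
def _example_voc_iou():
    bb = np.array([0.0, 0.0, 9.0, 9.0])      # detection, 100 pixels
    BBGT = np.array([[0.0, 0.0, 4.0, 9.0]])  # ground truth, 50 pixels
    iw = np.minimum(BBGT[:, 2], bb[2]) - np.maximum(BBGT[:, 0], bb[0]) + 1.0
    ih = np.minimum(BBGT[:, 3], bb[3]) - np.maximum(BBGT[:, 1], bb[1]) + 1.0
    inters = np.maximum(iw, 0.0) * np.maximum(ih, 0.0)  # 50 pixels
    uni = (
        (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
        + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0)
        - inters
    )  # 100 + 50 - 50 = 100 pixels
    assert inters[0] / uni[0] == 0.5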
# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
import json
import numpy as np
import os
import torch
from pycocotools.cocoeval import COCOeval, maskUtils
from detectron2.structures import BoxMode, RotatedBoxes, pairwise_iou_rotated
from detectron2.utils.file_io import PathManager
from .coco_evaluation import COCOEvaluator
class RotatedCOCOeval(COCOeval):
@staticmethod
def is_rotated(box_list):
if type(box_list) is np.ndarray:
return box_list.shape[1] == 5
elif type(box_list) is list:
if box_list == []: # cannot decide the box_dim
return False
return np.all(
np.array(
[
(len(obj) == 5) and ((type(obj) is list) or (type(obj) is np.ndarray))
for obj in box_list
]
)
)
return False
@staticmethod
def boxlist_to_tensor(boxlist, output_box_dim):
if type(boxlist) is np.ndarray:
box_tensor = torch.from_numpy(boxlist)
elif type(boxlist) is list:
if boxlist == []:
return torch.zeros((0, output_box_dim), dtype=torch.float32)
else:
box_tensor = torch.FloatTensor(boxlist)
else:
raise Exception("Unrecognized boxlist type")
input_box_dim = box_tensor.shape[1]
if input_box_dim != output_box_dim:
if input_box_dim == 4 and output_box_dim == 5:
box_tensor = BoxMode.convert(box_tensor, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS)
else:
raise Exception(
"Unable to convert from {}-dim box to {}-dim box".format(
input_box_dim, output_box_dim
)
)
return box_tensor
def compute_iou_dt_gt(self, dt, gt, is_crowd):
if self.is_rotated(dt) or self.is_rotated(gt):
# TODO: take is_crowd into consideration
assert all(c == 0 for c in is_crowd)
dt = RotatedBoxes(self.boxlist_to_tensor(dt, output_box_dim=5))
gt = RotatedBoxes(self.boxlist_to_tensor(gt, output_box_dim=5))
return pairwise_iou_rotated(dt, gt)
else:
# This is the same as the classical COCO evaluation
return maskUtils.iou(dt, gt, is_crowd)
def computeIoU(self, imgId: int, catId: int):
p = self.params
if p.useCats:
gt = self._gts[imgId, catId]
dt = self._dts[imgId, catId]
else:
gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
if len(gt) == 0 or len(dt) == 0:
return []
inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
dt = [dt[i] for i in inds]
if len(dt) > p.maxDets[-1]:
dt = dt[0 : p.maxDets[-1]]
assert p.iouType == "bbox", "unsupported iouType for iou computation"
g = [g["bbox"] for g in gt]
d = [d["bbox"] for d in dt]
# compute iou between each dt and gt region
iscrowd = [int(o["iscrowd"]) for o in gt]
# Note: this function is copied from cocoeval.py in cocoapi
# and the major difference is here.
ious = self.compute_iou_dt_gt(d, g, iscrowd)
return ious
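# A minimal sketch of the rotated-IoU path taken by compute_iou_dt_gt above:
# boxes are (x_ctr, y_ctr, width, height, angle), and identical boxes have IoU 1.
def _example_rotated_iou():
    boxes = RotatedBoxes(torch.tensor([[50.0, 50.0, 20.0, 10.0, 30.0]]))
    iou = pairwise_iou_rotated(boxes, boxes)  # 1x1 IoU matrix
    assert abs(iou[0, 0].item() - 1.0) < 1e-4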
class RotatedCOCOEvaluator(COCOEvaluator):
"""
Evaluate object proposal/instance detection outputs using COCO-like metrics and APIs,
with rotated boxes support.
Note: this uses IOU only and does not consider angle differences.
"""
def process(self, inputs, outputs):
"""
Args:
inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
It is a list of dict. Each dict corresponds to an image and
contains keys like "height", "width", "file_name", "image_id".
outputs: the outputs of a COCO model. It is a list of dicts with key
"instances" that contains :class:`Instances`.
"""
for input, output in zip(inputs, outputs):
prediction = {"image_id": input["image_id"]}
if "instances" in output:
instances = output["instances"].to(self._cpu_device)
prediction["instances"] = self.instances_to_json(instances, input["image_id"])
if "proposals" in output:
prediction["proposals"] = output["proposals"].to(self._cpu_device)
self._predictions.append(prediction)
def instances_to_json(self, instances, img_id):
num_instance = len(instances)
if num_instance == 0:
return []
boxes = instances.pred_boxes.tensor.numpy()
if boxes.shape[1] == 4:
boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
boxes = boxes.tolist()
scores = instances.scores.tolist()
classes = instances.pred_classes.tolist()
results = []
for k in range(num_instance):
result = {
"image_id": img_id,
"category_id": classes[k],
"bbox": boxes[k],
"score": scores[k],
}
results.append(result)
return results
def _eval_predictions(self, predictions, img_ids=None): # img_ids: unused
"""
Evaluate predictions on the given tasks.
Fill self._results with the metrics of the tasks.
"""
self._logger.info("Preparing results for COCO format ...")
coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
# unmap the category ids for COCO
if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
reverse_id_mapping = {
v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
}
for result in coco_results:
result["category_id"] = reverse_id_mapping[result["category_id"]]
if self._output_dir:
file_path = os.path.join(self._output_dir, "coco_instances_results.json")
self._logger.info("Saving results to {}".format(file_path))
with PathManager.open(file_path, "w") as f:
f.write(json.dumps(coco_results))
f.flush()
if not self._do_evaluation:
self._logger.info("Annotations are not available for evaluation.")
return
self._logger.info("Evaluating predictions ...")
assert self._tasks is None or set(self._tasks) == {
"bbox"
}, "[RotatedCOCOEvaluator] Only bbox evaluation is supported"
coco_eval = (
self._evaluate_predictions_on_coco(self._coco_api, coco_results)
if len(coco_results) > 0
else None # cocoapi does not handle empty results very well
)
task = "bbox"
res = self._derive_coco_results(
coco_eval, task, class_names=self._metadata.get("thing_classes")
)
self._results[task] = res
def _evaluate_predictions_on_coco(self, coco_gt, coco_results):
"""
Evaluate the coco results using COCOEval API.
"""
assert len(coco_results) > 0
coco_dt = coco_gt.loadRes(coco_results)
# Only bbox is supported for now
coco_eval = RotatedCOCOeval(coco_gt, coco_dt, iouType="bbox")
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
return coco_eval
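# A minimal sketch of the box conversion done in RotatedCOCOeval.boxlist_to_tensor:
# a 4-dim (x0, y0, w, h) box becomes a 5-dim (x_ctr, y_ctr, w, h, angle=0) box.
def _example_xywh_to_xywha():
    box = torch.tensor([[10.0, 20.0, 30.0, 40.0]])
    rotated = BoxMode.convert(box, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS)
    assert rotated.tolist() == [[25.0, 40.0, 30.0, 40.0, 0.0]]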
# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
import json
import logging
import numpy as np
import os
from collections import OrderedDict
from typing import Optional, Union
import pycocotools.mask as mask_util
import torch
from PIL import Image
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.comm import all_gather, is_main_process, synchronize
from detectron2.utils.file_io import PathManager
from .evaluator import DatasetEvaluator
_CV2_IMPORTED = True
try:
import cv2 # noqa
except ImportError:
# OpenCV is an optional dependency at the moment
_CV2_IMPORTED = False
def load_image_into_numpy_array(
filename: str,
dtype: Optional[Union[np.dtype, str]] = None,
) -> np.ndarray:
with PathManager.open(filename, "rb") as f:
array = np.asarray(Image.open(f), dtype=dtype)
return array
class SemSegEvaluator(DatasetEvaluator):
"""
Evaluate semantic segmentation metrics.
"""
def __init__(
self,
dataset_name,
distributed=True,
output_dir=None,
*,
sem_seg_loading_fn=load_image_into_numpy_array,
num_classes=None,
ignore_label=None,
):
"""
Args:
dataset_name (str): name of the dataset to be evaluated.
distributed (bool): if True, will collect results from all ranks for evaluation.
Otherwise, will evaluate the results in the current process.
output_dir (str): an output directory to dump results.
sem_seg_loading_fn: function to read sem seg file and load into numpy array.
Default provided, but projects can customize.
            num_classes, ignore_label: deprecated arguments
"""
self._logger = logging.getLogger(__name__)
if num_classes is not None:
self._logger.warn(
"SemSegEvaluator(num_classes) is deprecated! It should be obtained from metadata."
)
if ignore_label is not None:
self._logger.warn(
"SemSegEvaluator(ignore_label) is deprecated! It should be obtained from metadata."
)
self._dataset_name = dataset_name
self._distributed = distributed
self._output_dir = output_dir
self._cpu_device = torch.device("cpu")
self.input_file_to_gt_file = {
dataset_record["file_name"]: dataset_record["sem_seg_file_name"]
for dataset_record in DatasetCatalog.get(dataset_name)
}
meta = MetadataCatalog.get(dataset_name)
# Dict that maps contiguous training ids to COCO category ids
try:
c2d = meta.stuff_dataset_id_to_contiguous_id
self._contiguous_id_to_dataset_id = {v: k for k, v in c2d.items()}
except AttributeError:
self._contiguous_id_to_dataset_id = None
self._class_names = meta.stuff_classes
self.sem_seg_loading_fn = sem_seg_loading_fn
self._num_classes = len(meta.stuff_classes)
if num_classes is not None:
assert self._num_classes == num_classes, f"{self._num_classes} != {num_classes}"
self._ignore_label = ignore_label if ignore_label is not None else meta.ignore_label
        # cv2.erode does not work for the int datatype; it only works for uint8.
self._compute_boundary_iou = True
if not _CV2_IMPORTED:
self._compute_boundary_iou = False
self._logger.warn(
"""Boundary IoU calculation requires OpenCV. B-IoU metrics are
not going to be computed because OpenCV is not available to import."""
)
if self._num_classes >= np.iinfo(np.uint8).max:
self._compute_boundary_iou = False
            self._logger.warn(
                f"""SemSegEvaluator(num_classes) is more than the supported value for Boundary IoU
                calculation! B-IoU metrics are not going to be computed. The max allowed value
                (exclusive) of num_classes for calculating Boundary IoU is
                {np.iinfo(np.uint8).max}. The number of classes of dataset {self._dataset_name} is
                {self._num_classes}."""
            )
def reset(self):
self._conf_matrix = np.zeros((self._num_classes + 1, self._num_classes + 1), dtype=np.int64)
self._b_conf_matrix = np.zeros(
(self._num_classes + 1, self._num_classes + 1), dtype=np.int64
)
self._predictions = []
def process(self, inputs, outputs):
"""
Args:
inputs: the inputs to a model.
It is a list of dicts. Each dict corresponds to an image and
contains keys like "height", "width", "file_name".
outputs: the outputs of a model. It is either list of semantic segmentation predictions
(Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic
segmentation prediction in the same format.
"""
for input, output in zip(inputs, outputs):
output = output["sem_seg"].argmax(dim=0).to(self._cpu_device)
pred = np.array(output, dtype=int)
gt_filename = self.input_file_to_gt_file[input["file_name"]]
gt = self.sem_seg_loading_fn(gt_filename, dtype=int)
gt[gt == self._ignore_label] = self._num_classes
self._conf_matrix += np.bincount(
(self._num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
minlength=self._conf_matrix.size,
).reshape(self._conf_matrix.shape)
if self._compute_boundary_iou:
b_gt = self._mask_to_boundary(gt.astype(np.uint8))
b_pred = self._mask_to_boundary(pred.astype(np.uint8))
self._b_conf_matrix += np.bincount(
(self._num_classes + 1) * b_pred.reshape(-1) + b_gt.reshape(-1),
minlength=self._conf_matrix.size,
).reshape(self._conf_matrix.shape)
self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"]))
def evaluate(self):
"""
Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):
* Mean intersection-over-union averaged across classes (mIoU)
* Frequency Weighted IoU (fwIoU)
* Mean pixel accuracy averaged across classes (mACC)
* Pixel Accuracy (pACC)
"""
if self._distributed:
synchronize()
conf_matrix_list = all_gather(self._conf_matrix)
b_conf_matrix_list = all_gather(self._b_conf_matrix)
self._predictions = all_gather(self._predictions)
self._predictions = list(itertools.chain(*self._predictions))
if not is_main_process():
return
self._conf_matrix = np.zeros_like(self._conf_matrix)
for conf_matrix in conf_matrix_list:
self._conf_matrix += conf_matrix
self._b_conf_matrix = np.zeros_like(self._b_conf_matrix)
for b_conf_matrix in b_conf_matrix_list:
self._b_conf_matrix += b_conf_matrix
if self._output_dir:
PathManager.mkdirs(self._output_dir)
file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
with PathManager.open(file_path, "w") as f:
f.write(json.dumps(self._predictions))
acc = np.full(self._num_classes, np.nan, dtype=float)
iou = np.full(self._num_classes, np.nan, dtype=float)
tp = self._conf_matrix.diagonal()[:-1].astype(float)
pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(float)
class_weights = pos_gt / np.sum(pos_gt)
pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(float)
acc_valid = pos_gt > 0
acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
union = pos_gt + pos_pred - tp
iou_valid = np.logical_and(acc_valid, union > 0)
iou[iou_valid] = tp[iou_valid] / union[iou_valid]
macc = np.sum(acc[acc_valid]) / np.sum(acc_valid)
miou = np.sum(iou[iou_valid]) / np.sum(iou_valid)
fiou = np.sum(iou[iou_valid] * class_weights[iou_valid])
pacc = np.sum(tp) / np.sum(pos_gt)
if self._compute_boundary_iou:
b_iou = np.full(self._num_classes, np.nan, dtype=float)
b_tp = self._b_conf_matrix.diagonal()[:-1].astype(float)
b_pos_gt = np.sum(self._b_conf_matrix[:-1, :-1], axis=0).astype(float)
b_pos_pred = np.sum(self._b_conf_matrix[:-1, :-1], axis=1).astype(float)
b_union = b_pos_gt + b_pos_pred - b_tp
b_iou_valid = b_union > 0
b_iou[b_iou_valid] = b_tp[b_iou_valid] / b_union[b_iou_valid]
res = {}
res["mIoU"] = 100 * miou
res["fwIoU"] = 100 * fiou
for i, name in enumerate(self._class_names):
res[f"IoU-{name}"] = 100 * iou[i]
if self._compute_boundary_iou:
res[f"BoundaryIoU-{name}"] = 100 * b_iou[i]
res[f"min(IoU, B-Iou)-{name}"] = 100 * min(iou[i], b_iou[i])
res["mACC"] = 100 * macc
res["pACC"] = 100 * pacc
for i, name in enumerate(self._class_names):
res[f"ACC-{name}"] = 100 * acc[i]
if self._output_dir:
file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
with PathManager.open(file_path, "wb") as f:
torch.save(res, f)
results = OrderedDict({"sem_seg": res})
self._logger.info(results)
return results
def encode_json_sem_seg(self, sem_seg, input_file_name):
"""
Convert semantic segmentation to COCO stuff format with segments encoded as RLEs.
See http://cocodataset.org/#format-results
"""
json_list = []
for label in np.unique(sem_seg):
if self._contiguous_id_to_dataset_id is not None:
assert (
label in self._contiguous_id_to_dataset_id
), "Label {} is not in the metadata info for {}".format(label, self._dataset_name)
dataset_id = self._contiguous_id_to_dataset_id[label]
else:
dataset_id = int(label)
mask = (sem_seg == label).astype(np.uint8)
mask_rle = mask_util.encode(np.array(mask[:, :, None], order="F"))[0]
mask_rle["counts"] = mask_rle["counts"].decode("utf-8")
json_list.append(
{"file_name": input_file_name, "category_id": dataset_id, "segmentation": mask_rle}
)
return json_list
def _mask_to_boundary(self, mask: np.ndarray, dilation_ratio=0.02):
assert mask.ndim == 2, "mask_to_boundary expects a 2-dimensional image"
h, w = mask.shape
diag_len = np.sqrt(h**2 + w**2)
dilation = max(1, int(round(dilation_ratio * diag_len)))
kernel = np.ones((3, 3), dtype=np.uint8)
padded_mask = cv2.copyMakeBorder(mask, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=0)
eroded_mask_with_padding = cv2.erode(padded_mask, kernel, iterations=dilation)
eroded_mask = eroded_mask_with_padding[1:-1, 1:-1]
boundary = mask - eroded_mask
return boundary
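# A minimal sketch (toy labels) of the confusion-matrix trick used in
# SemSegEvaluator.process above: each (pred, gt) pixel pair is flattened into a
# single index (num_classes + 1) * pred + gt and counted with np.bincount.
def _example_bincount_conf_matrix():
    num_classes = 2  # one extra row/column holds ignored pixels
    pred = np.array([0, 0, 1, 1])
    gt = np.array([0, 1, 1, 1])
    conf = np.bincount(
        (num_classes + 1) * pred + gt, minlength=(num_classes + 1) ** 2
    ).reshape(num_classes + 1, num_classes + 1)
    # conf[p, g] counts pixels predicted as class p with ground truth g
    assert conf[0, 0] == 1 and conf[0, 1] == 1 and conf[1, 1] == 2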
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
import pprint
import sys
from collections.abc import Mapping
def print_csv_format(results):
"""
Print main metrics in a format similar to Detectron,
so that they are easy to copypaste into a spreadsheet.
Args:
results (OrderedDict[dict]): task_name -> {metric -> score}
unordered dict can also be printed, but in arbitrary order
"""
assert isinstance(results, Mapping) or not len(results), results
logger = logging.getLogger(__name__)
for task, res in results.items():
if isinstance(res, Mapping):
# Don't print "AP-category" metrics since they are usually not tracked.
important_res = [(k, v) for k, v in res.items() if "-" not in k]
logger.info("copypaste: Task: {}".format(task))
logger.info("copypaste: " + ",".join([k[0] for k in important_res]))
logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res]))
else:
logger.info(f"copypaste: {task}={res}")
def verify_results(cfg, results):
"""
Args:
results (OrderedDict[dict]): task_name -> {metric -> score}
Returns:
bool: whether the verification succeeds or not
"""
expected_results = cfg.TEST.EXPECTED_RESULTS
if not len(expected_results):
return True
ok = True
for task, metric, expected, tolerance in expected_results:
actual = results[task].get(metric, None)
if actual is None:
ok = False
continue
if not np.isfinite(actual):
ok = False
continue
diff = abs(actual - expected)
if diff > tolerance:
ok = False
logger = logging.getLogger(__name__)
if not ok:
logger.error("Result verification failed!")
logger.error("Expected Results: " + str(expected_results))
logger.error("Actual Results: " + pprint.pformat(results))
sys.exit(1)
else:
logger.info("Results verification passed.")
return ok
def flatten_results_dict(results):
"""
Expand a hierarchical dict of scalars into a flat dict of scalars.
If results[k1][k2][k3] = v, the returned dict will have the entry
{"k1/k2/k3": v}.
Args:
results (dict):
"""
r = {}
for k, v in results.items():
if isinstance(v, Mapping):
v = flatten_results_dict(v)
for kk, vv in v.items():
r[k + "/" + kk] = vv
else:
r[k] = v
return r
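# A minimal sketch of flatten_results_dict: nested task/metric dicts flatten
# into "task/metric" keys (the values below are made up).
def _example_flatten_results_dict():
    nested = {"bbox": {"AP": 40.0, "AP50": 60.0}, "iteration": 90000}
    assert flatten_results_dict(nested) == {
        "bbox/AP": 40.0,
        "bbox/AP50": 60.0,
        "iteration": 90000,
    }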
This directory contains code to prepare a detectron2 model for deployment.
Currently it supports exporting a detectron2 model to TorchScript, ONNX, or (deprecated) Caffe2 format.
Please see [documentation](https://detectron2.readthedocs.io/tutorials/deployment.html) for its usage.
### Acknowledgements
Thanks to the Mobile Vision team at Facebook for developing the Caffe2 conversion tools.
Thanks to the Computing Platform Department - PAI team at Alibaba Group (@bddpqq, @chenbohua3) who
helped export Detectron2 models to TorchScript.
Thanks to the ONNX Converter team at Microsoft who helped export Detectron2 models to ONNX.
# -*- coding: utf-8 -*-
import warnings
from .flatten import TracingAdapter
from .torchscript import dump_torchscript_IR, scripting_with_instances
try:
from caffe2.proto import caffe2_pb2 as _tmp
from caffe2.python import core
# caffe2 is optional
except ImportError:
pass
else:
from .api import *
# TODO: Update ONNX Opset version and run tests when a newer PyTorch is supported
STABLE_ONNX_OPSET_VERSION = 11
def add_export_config(cfg):
warnings.warn(
"add_export_config has been deprecated and behaves as no-op function.", DeprecationWarning
)
return cfg
__all__ = [k for k in globals().keys() if not k.startswith("_")]
# Copyright (c) Facebook, Inc. and its affiliates.
import copy
import logging
import os
import torch
from caffe2.proto import caffe2_pb2
from torch import nn
from detectron2.config import CfgNode
from detectron2.utils.file_io import PathManager
from .caffe2_inference import ProtobufDetectionModel
from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
from .shared import get_pb_arg_vali, get_pb_arg_vals, save_graph
__all__ = [
"Caffe2Model",
"Caffe2Tracer",
]
class Caffe2Tracer:
"""
Make a detectron2 model traceable with Caffe2 operators.
    This class creates a traceable version of a detectron2 model which:
    1. Rewrites parts of the model using ops in Caffe2. Note that some ops do
       not have a GPU implementation in Caffe2.
    2. Removes post-processing and only produces raw layer outputs.
    After making a traceable model, the class provides methods to export such a
    model to different deployment formats.
    The exported graph produced by this class takes two input tensors:
    1. (1, C, H, W) float "data", which is an image (usually in [0, 255]).
       (H, W) often has to be padded to a multiple of 32 (depending on the model
       architecture).
    2. 1x3 float "im_info", each row of which is (height, width, 1.0).
       Height and width are the true image shape before padding.
The class currently only supports models using builtin meta architectures.
Batch inference is not supported, and contributions are welcome.
"""
def __init__(self, cfg: CfgNode, model: nn.Module, inputs):
"""
Args:
cfg (CfgNode): a detectron2 config used to construct caffe2-compatible model.
model (nn.Module): An original pytorch model. Must be among a few official models
in detectron2 that can be converted to become caffe2-compatible automatically.
Weights have to be already loaded to this model.
inputs: sample inputs that the given model takes for inference.
Will be used to trace the model. For most models, random inputs with
no detected objects will not work as they lead to wrong traces.
"""
assert isinstance(cfg, CfgNode), cfg
assert isinstance(model, torch.nn.Module), type(model)
# TODO make it support custom models, by passing in c2 model directly
C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[cfg.MODEL.META_ARCHITECTURE]
self.traceable_model = C2MetaArch(cfg, copy.deepcopy(model))
self.inputs = inputs
self.traceable_inputs = self.traceable_model.get_caffe2_inputs(inputs)
def export_caffe2(self):
"""
Export the model to Caffe2's protobuf format.
The returned object can be saved with its :meth:`.save_protobuf()` method.
The result can be loaded and executed using Caffe2 runtime.
Returns:
:class:`Caffe2Model`
"""
from .caffe2_export import export_caffe2_detection_model
predict_net, init_net = export_caffe2_detection_model(
self.traceable_model, self.traceable_inputs
)
return Caffe2Model(predict_net, init_net)
def export_onnx(self):
"""
Export the model to ONNX format.
Note that the exported model contains custom ops only available in caffe2, therefore it
        cannot be directly executed by other runtimes (such as onnxruntime or TensorRT).
Post-processing or transformation passes may be applied on the model to accommodate
different runtimes, but we currently do not provide support for them.
Returns:
onnx.ModelProto: an onnx model.
"""
from .caffe2_export import export_onnx_model as export_onnx_model_impl
return export_onnx_model_impl(self.traceable_model, (self.traceable_inputs,))
def export_torchscript(self):
"""
Export the model to a ``torch.jit.TracedModule`` by tracing.
The returned object can be saved to a file by ``.save()``.
Returns:
torch.jit.TracedModule: a torch TracedModule
"""
logger = logging.getLogger(__name__)
logger.info("Tracing the model with torch.jit.trace ...")
with torch.no_grad():
return torch.jit.trace(self.traceable_model, (self.traceable_inputs,))
class Caffe2Model(nn.Module):
"""
A wrapper around the traced model in Caffe2's protobuf format.
The exported graph has different inputs/outputs from the original Pytorch
model, as explained in :class:`Caffe2Tracer`. This class wraps around the
exported graph to simulate the same interface as the original Pytorch model.
    It also provides functions to save/load models in Caffe2's format.
Examples:
::
c2_model = Caffe2Tracer(cfg, torch_model, inputs).export_caffe2()
inputs = [{"image": img_tensor_CHW}]
outputs = c2_model(inputs)
orig_outputs = torch_model(inputs)
"""
def __init__(self, predict_net, init_net):
super().__init__()
self.eval() # always in eval mode
self._predict_net = predict_net
self._init_net = init_net
self._predictor = None
__init__.__HIDE_SPHINX_DOC__ = True
@property
def predict_net(self):
"""
caffe2.core.Net: the underlying caffe2 predict net
"""
return self._predict_net
@property
def init_net(self):
"""
caffe2.core.Net: the underlying caffe2 init net
"""
return self._init_net
def save_protobuf(self, output_dir):
"""
Save the model as caffe2's protobuf format.
It saves the following files:
* "model.pb": definition of the graph. Can be visualized with
tools like `netron <https://github.com/lutzroeder/netron>`_.
* "model_init.pb": model parameters
* "model.pbtxt": human-readable definition of the graph. Not
needed for deployment.
Args:
output_dir (str): the output directory to save protobuf files.
"""
logger = logging.getLogger(__name__)
logger.info("Saving model to {} ...".format(output_dir))
if not PathManager.exists(output_dir):
PathManager.mkdirs(output_dir)
with PathManager.open(os.path.join(output_dir, "model.pb"), "wb") as f:
f.write(self._predict_net.SerializeToString())
with PathManager.open(os.path.join(output_dir, "model.pbtxt"), "w") as f:
f.write(str(self._predict_net))
with PathManager.open(os.path.join(output_dir, "model_init.pb"), "wb") as f:
f.write(self._init_net.SerializeToString())
def save_graph(self, output_file, inputs=None):
"""
Save the graph as SVG format.
Args:
            output_file (str): an SVG file
inputs: optional inputs given to the model.
If given, the inputs will be used to run the graph to record
shape of every tensor. The shape information will be
saved together with the graph.
"""
from .caffe2_export import run_and_save_graph
if inputs is None:
save_graph(self._predict_net, output_file, op_only=False)
else:
size_divisibility = get_pb_arg_vali(self._predict_net, "size_divisibility", 0)
device = get_pb_arg_vals(self._predict_net, "device", b"cpu").decode("ascii")
inputs = convert_batched_inputs_to_c2_format(inputs, size_divisibility, device)
inputs = [x.cpu().numpy() for x in inputs]
run_and_save_graph(self._predict_net, self._init_net, inputs, output_file)
@staticmethod
def load_protobuf(dir):
"""
Args:
dir (str): a directory used to save Caffe2Model with
:meth:`save_protobuf`.
The files "model.pb" and "model_init.pb" are needed.
Returns:
Caffe2Model: the caffe2 model loaded from this directory.
"""
predict_net = caffe2_pb2.NetDef()
with PathManager.open(os.path.join(dir, "model.pb"), "rb") as f:
predict_net.ParseFromString(f.read())
init_net = caffe2_pb2.NetDef()
with PathManager.open(os.path.join(dir, "model_init.pb"), "rb") as f:
init_net.ParseFromString(f.read())
return Caffe2Model(predict_net, init_net)
def __call__(self, inputs):
"""
An interface that wraps around a Caffe2 model and mimics detectron2's models'
input/output format. See details about the format at :doc:`/tutorials/models`.
        This is used to compare the outputs of the caffe2 model with those of its original
        torch model. Due to the extra conversion between Pytorch/Caffe2, this method is
        not meant for benchmarking. Because of the conversion, this method also depends
        on detectron2 in order to convert outputs to detectron2's format.
"""
if self._predictor is None:
self._predictor = ProtobufDetectionModel(self._predict_net, self._init_net)
return self._predictor(inputs)
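# A minimal usage sketch (hypothetical paths): round-trip a traced model
# through save_protobuf / load_protobuf, then run it like a detectron2 model.
#
#   c2_model = Caffe2Tracer(cfg, torch_model, inputs).export_caffe2()
#   c2_model.save_protobuf("./caffe2_model")        # model.pb, model_init.pb, model.pbtxt
#   reloaded = Caffe2Model.load_protobuf("./caffe2_model")
#   outputs = reloaded([{"image": img_tensor_CHW}])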
# Copyright (c) Facebook, Inc. and its affiliates.
import math
from typing import Dict
import torch
import torch.nn.functional as F
from detectron2.layers import ShapeSpec, cat
from detectron2.layers.roi_align_rotated import ROIAlignRotated
from detectron2.modeling import poolers
from detectron2.modeling.proposal_generator import rpn
from detectron2.modeling.roi_heads.mask_head import mask_rcnn_inference
from detectron2.structures import Boxes, ImageList, Instances, Keypoints, RotatedBoxes
from .shared import alias, to_device
"""
This file contains caffe2-compatible implementation of several detectron2 components.
"""
class Caffe2Boxes(Boxes):
"""
    Represents a list of detectron2.structures.Boxes from a minibatch; each box
is represented by a 5d vector (batch index + 4 coordinates), or a 6d vector
(batch index + 5 coordinates) for RotatedBoxes.
"""
def __init__(self, tensor):
assert isinstance(tensor, torch.Tensor)
assert tensor.dim() == 2 and tensor.size(-1) in [4, 5, 6], tensor.size()
# TODO: make tensor immutable when dim is Nx5 for Boxes,
# and Nx6 for RotatedBoxes?
self.tensor = tensor
# TODO clean up this class, maybe just extend Instances
class InstancesList:
"""
    Tensor representation of a list of Instances objects for a batch of images.
    When dealing with a batch of images with Caffe2 ops, a list of bboxes
    (instances) is usually represented by a single Tensor with size
    (sigma(Ni), 5) or (sigma(Ni), 4) plus a batch split Tensor. This class
    provides common functions to convert between these two representations.
"""
def __init__(self, im_info, indices, extra_fields=None):
# [N, 3] -> (H, W, Scale)
self.im_info = im_info
        # [N,] -> index of the image in the batch to which the instance belongs
self.indices = indices
# [N, ...]
self.batch_extra_fields = extra_fields or {}
self.image_size = self.im_info
def get_fields(self):
"""like `get_fields` in the Instances object,
but return each field in tensor representations"""
ret = {}
for k, v in self.batch_extra_fields.items():
# if isinstance(v, torch.Tensor):
# tensor_rep = v
# elif isinstance(v, (Boxes, Keypoints)):
# tensor_rep = v.tensor
# else:
# raise ValueError("Can't find tensor representation for: {}".format())
ret[k] = v
return ret
def has(self, name):
return name in self.batch_extra_fields
def set(self, name, value):
# len(tensor) is a bad practice that generates ONNX constants during tracing.
# Although not a problem for the `assert` statement below, torch ONNX exporter
        # still raises a misleading warning, as it does not know this call comes from `assert`.
if isinstance(value, Boxes):
data_len = value.tensor.shape[0]
elif isinstance(value, torch.Tensor):
data_len = value.shape[0]
else:
data_len = len(value)
if len(self.batch_extra_fields):
assert (
len(self) == data_len
), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self))
self.batch_extra_fields[name] = value
def __getattr__(self, name):
if name not in self.batch_extra_fields:
raise AttributeError("Cannot find field '{}' in the given Instances!".format(name))
return self.batch_extra_fields[name]
def __len__(self):
return len(self.indices)
def flatten(self):
ret = []
for _, v in self.batch_extra_fields.items():
if isinstance(v, (Boxes, Keypoints)):
ret.append(v.tensor)
else:
ret.append(v)
return ret
@staticmethod
def to_d2_instances_list(instances_list):
"""
Convert InstancesList to List[Instances]. The input `instances_list` can
        also be a List[Instances], in which case this method is a no-op.
"""
if not isinstance(instances_list, InstancesList):
assert all(isinstance(x, Instances) for x in instances_list)
return instances_list
ret = []
for i, info in enumerate(instances_list.im_info):
instances = Instances(torch.Size([int(info[0].item()), int(info[1].item())]))
ids = instances_list.indices == i
for k, v in instances_list.batch_extra_fields.items():
if isinstance(v, torch.Tensor):
instances.set(k, v[ids])
continue
elif isinstance(v, Boxes):
instances.set(k, v[ids, -4:])
continue
target_type, tensor_source = v
assert isinstance(tensor_source, torch.Tensor)
assert tensor_source.shape[0] == instances_list.indices.shape[0]
tensor_source = tensor_source[ids]
if issubclass(target_type, Boxes):
instances.set(k, Boxes(tensor_source[:, -4:]))
elif issubclass(target_type, Keypoints):
instances.set(k, Keypoints(tensor_source))
elif issubclass(target_type, torch.Tensor):
instances.set(k, tensor_source)
else:
raise ValueError("Can't handle targe type: {}".format(target_type))
ret.append(instances)
return ret
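# A minimal sketch (toy tensors) of the InstancesList -> List[Instances]
# round-trip above: two instances belonging to one 100x100 image convert back
# into a single plain Instances object.
def _example_to_d2_instances_list():
    im_info = torch.tensor([[100.0, 100.0, 1.0]])  # (H, W, scale)
    indices = torch.tensor([0.0, 0.0])  # both instances belong to image 0
    boxes = Caffe2Boxes(
        torch.tensor([[0.0, 10.0, 10.0, 20.0, 20.0], [0.0, 30.0, 30.0, 40.0, 40.0]])
    )
    ilist = InstancesList(im_info, indices, {"proposal_boxes": boxes})
    (instances,) = InstancesList.to_d2_instances_list(ilist)
    assert len(instances) == 2 and tuple(instances.image_size) == (100, 100)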
class Caffe2Compatible:
"""
A model can inherit this class to indicate that it can be traced and deployed with caffe2.
"""
def _get_tensor_mode(self):
return self._tensor_mode
def _set_tensor_mode(self, v):
self._tensor_mode = v
tensor_mode = property(_get_tensor_mode, _set_tensor_mode)
"""
If true, the model expects C2-style tensor only inputs/outputs format.
"""
class Caffe2RPN(Caffe2Compatible, rpn.RPN):
@classmethod
def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
ret = super(Caffe2Compatible, cls).from_config(cfg, input_shape)
assert tuple(cfg.MODEL.RPN.BBOX_REG_WEIGHTS) == (1.0, 1.0, 1.0, 1.0) or tuple(
cfg.MODEL.RPN.BBOX_REG_WEIGHTS
) == (1.0, 1.0, 1.0, 1.0, 1.0)
return ret
def _generate_proposals(
self, images, objectness_logits_pred, anchor_deltas_pred, gt_instances=None
):
assert isinstance(images, ImageList)
if self.tensor_mode:
im_info = images.image_sizes
else:
im_info = torch.tensor([[im_sz[0], im_sz[1], 1.0] for im_sz in images.image_sizes]).to(
images.tensor.device
)
assert isinstance(im_info, torch.Tensor)
rpn_rois_list = []
rpn_roi_probs_list = []
for scores, bbox_deltas, cell_anchors_tensor, feat_stride in zip(
objectness_logits_pred,
anchor_deltas_pred,
[b for (n, b) in self.anchor_generator.cell_anchors.named_buffers()],
self.anchor_generator.strides,
):
scores = scores.detach()
bbox_deltas = bbox_deltas.detach()
rpn_rois, rpn_roi_probs = torch.ops._caffe2.GenerateProposals(
scores,
bbox_deltas,
im_info,
cell_anchors_tensor,
spatial_scale=1.0 / feat_stride,
pre_nms_topN=self.pre_nms_topk[self.training],
post_nms_topN=self.post_nms_topk[self.training],
nms_thresh=self.nms_thresh,
min_size=self.min_box_size,
# correct_transform_coords=True, # deprecated argument
angle_bound_on=True, # Default
angle_bound_lo=-180,
angle_bound_hi=180,
clip_angle_thresh=1.0, # Default
legacy_plus_one=False,
)
rpn_rois_list.append(rpn_rois)
rpn_roi_probs_list.append(rpn_roi_probs)
        # For FPN in D2, in RPN all proposals from different levels are concatenated
        # together, ranked, and the top post_nms_topk are picked. Then in ROIPooler
# it calculates level_assignments and calls the RoIAlign from
# the corresponding level.
if len(objectness_logits_pred) == 1:
rpn_rois = rpn_rois_list[0]
rpn_roi_probs = rpn_roi_probs_list[0]
else:
assert len(rpn_rois_list) == len(rpn_roi_probs_list)
rpn_post_nms_topN = self.post_nms_topk[self.training]
device = rpn_rois_list[0].device
input_list = [to_device(x, "cpu") for x in (rpn_rois_list + rpn_roi_probs_list)]
# TODO remove this after confirming rpn_max_level/rpn_min_level
# is not needed in CollectRpnProposals.
feature_strides = list(self.anchor_generator.strides)
rpn_min_level = int(math.log2(feature_strides[0]))
rpn_max_level = int(math.log2(feature_strides[-1]))
assert (rpn_max_level - rpn_min_level + 1) == len(
rpn_rois_list
), "CollectRpnProposals requires continuous levels"
rpn_rois = torch.ops._caffe2.CollectRpnProposals(
input_list,
                # NOTE: in the current implementation, rpn_max_level and rpn_min_level
                # are not needed; only the difference between the two matters, and it
                # can be inferred from the number of inputs. Keep them for now for
                # consistency.
rpn_max_level=2 + len(rpn_rois_list) - 1,
rpn_min_level=2,
rpn_post_nms_topN=rpn_post_nms_topN,
)
rpn_rois = to_device(rpn_rois, device)
rpn_roi_probs = []
proposals = self.c2_postprocess(im_info, rpn_rois, rpn_roi_probs, self.tensor_mode)
return proposals, {}
def forward(self, images, features, gt_instances=None):
assert not self.training
features = [features[f] for f in self.in_features]
objectness_logits_pred, anchor_deltas_pred = self.rpn_head(features)
return self._generate_proposals(
images,
objectness_logits_pred,
anchor_deltas_pred,
gt_instances,
)
@staticmethod
def c2_postprocess(im_info, rpn_rois, rpn_roi_probs, tensor_mode):
proposals = InstancesList(
im_info=im_info,
indices=rpn_rois[:, 0],
extra_fields={
"proposal_boxes": Caffe2Boxes(rpn_rois),
"objectness_logits": (torch.Tensor, rpn_roi_probs),
},
)
if not tensor_mode:
proposals = InstancesList.to_d2_instances_list(proposals)
else:
proposals = [proposals]
return proposals
class Caffe2ROIPooler(Caffe2Compatible, poolers.ROIPooler):
@staticmethod
def c2_preprocess(box_lists):
assert all(isinstance(x, Boxes) for x in box_lists)
if all(isinstance(x, Caffe2Boxes) for x in box_lists):
# input is pure-tensor based
assert len(box_lists) == 1
pooler_fmt_boxes = box_lists[0].tensor
else:
pooler_fmt_boxes = poolers.convert_boxes_to_pooler_format(box_lists)
return pooler_fmt_boxes
def forward(self, x, box_lists):
assert not self.training
pooler_fmt_boxes = self.c2_preprocess(box_lists)
num_level_assignments = len(self.level_poolers)
if num_level_assignments == 1:
if isinstance(self.level_poolers[0], ROIAlignRotated):
c2_roi_align = torch.ops._caffe2.RoIAlignRotated
aligned = True
else:
c2_roi_align = torch.ops._caffe2.RoIAlign
aligned = self.level_poolers[0].aligned
x0 = x[0]
if x0.is_quantized:
x0 = x0.dequantize()
out = c2_roi_align(
x0,
pooler_fmt_boxes,
order="NCHW",
spatial_scale=float(self.level_poolers[0].spatial_scale),
pooled_h=int(self.output_size[0]),
pooled_w=int(self.output_size[1]),
sampling_ratio=int(self.level_poolers[0].sampling_ratio),
aligned=aligned,
)
return out
device = pooler_fmt_boxes.device
assert (
self.max_level - self.min_level + 1 == 4
), "Currently DistributeFpnProposals only support 4 levels"
fpn_outputs = torch.ops._caffe2.DistributeFpnProposals(
to_device(pooler_fmt_boxes, "cpu"),
roi_canonical_scale=self.canonical_box_size,
roi_canonical_level=self.canonical_level,
roi_max_level=self.max_level,
roi_min_level=self.min_level,
legacy_plus_one=False,
)
fpn_outputs = [to_device(x, device) for x in fpn_outputs]
rois_fpn_list = fpn_outputs[:-1]
rois_idx_restore_int32 = fpn_outputs[-1]
roi_feat_fpn_list = []
for roi_fpn, x_level, pooler in zip(rois_fpn_list, x, self.level_poolers):
if isinstance(pooler, ROIAlignRotated):
c2_roi_align = torch.ops._caffe2.RoIAlignRotated
aligned = True
else:
c2_roi_align = torch.ops._caffe2.RoIAlign
aligned = bool(pooler.aligned)
if x_level.is_quantized:
x_level = x_level.dequantize()
roi_feat_fpn = c2_roi_align(
x_level,
roi_fpn,
order="NCHW",
spatial_scale=float(pooler.spatial_scale),
pooled_h=int(self.output_size[0]),
pooled_w=int(self.output_size[1]),
sampling_ratio=int(pooler.sampling_ratio),
aligned=aligned,
)
roi_feat_fpn_list.append(roi_feat_fpn)
roi_feat_shuffled = cat(roi_feat_fpn_list, dim=0)
assert roi_feat_shuffled.numel() > 0 and rois_idx_restore_int32.numel() > 0, (
"Caffe2 export requires tracing with a model checkpoint + input that can produce valid"
" detections. But no detections were obtained with the given checkpoint and input!"
)
roi_feat = torch.ops._caffe2.BatchPermutation(roi_feat_shuffled, rois_idx_restore_int32)
return roi_feat
def caffe2_fast_rcnn_outputs_inference(tensor_mode, box_predictor, predictions, proposals):
"""equivalent to FastRCNNOutputLayers.inference"""
num_classes = box_predictor.num_classes
score_thresh = box_predictor.test_score_thresh
nms_thresh = box_predictor.test_nms_thresh
topk_per_image = box_predictor.test_topk_per_image
is_rotated = len(box_predictor.box2box_transform.weights) == 5
if is_rotated:
box_dim = 5
assert box_predictor.box2box_transform.weights[4] == 1, (
"The weights for Rotated BBoxTransform in C2 have only 4 dimensions,"
+ " thus enforcing the angle weight to be 1 for now"
)
box2box_transform_weights = box_predictor.box2box_transform.weights[:4]
else:
box_dim = 4
box2box_transform_weights = box_predictor.box2box_transform.weights
class_logits, box_regression = predictions
if num_classes + 1 == class_logits.shape[1]:
class_prob = F.softmax(class_logits, -1)
else:
assert num_classes == class_logits.shape[1]
class_prob = F.sigmoid(class_logits)
# BoxWithNMSLimit will infer num_classes from the shape of the class_prob
# So append a zero column as placeholder for the background class
class_prob = torch.cat((class_prob, torch.zeros(class_prob.shape[0], 1)), dim=1)
assert box_regression.shape[1] % box_dim == 0
cls_agnostic_bbox_reg = box_regression.shape[1] // box_dim == 1
input_tensor_mode = proposals[0].proposal_boxes.tensor.shape[1] == box_dim + 1
proposal_boxes = proposals[0].proposal_boxes
if isinstance(proposal_boxes, Caffe2Boxes):
rois = Caffe2Boxes.cat([p.proposal_boxes for p in proposals])
elif isinstance(proposal_boxes, RotatedBoxes):
rois = RotatedBoxes.cat([p.proposal_boxes for p in proposals])
elif isinstance(proposal_boxes, Boxes):
rois = Boxes.cat([p.proposal_boxes for p in proposals])
else:
raise NotImplementedError(
'Expected proposals[0].proposal_boxes to be type "Boxes", '
f"instead got {type(proposal_boxes)}"
)
device, dtype = rois.tensor.device, rois.tensor.dtype
if input_tensor_mode:
im_info = proposals[0].image_size
rois = rois.tensor
else:
im_info = torch.tensor([[sz[0], sz[1], 1.0] for sz in [x.image_size for x in proposals]])
batch_ids = cat(
[
torch.full((b, 1), i, dtype=dtype, device=device)
for i, b in enumerate(len(p) for p in proposals)
],
dim=0,
)
rois = torch.cat([batch_ids, rois.tensor], dim=1)
roi_pred_bbox, roi_batch_splits = torch.ops._caffe2.BBoxTransform(
to_device(rois, "cpu"),
to_device(box_regression, "cpu"),
to_device(im_info, "cpu"),
weights=box2box_transform_weights,
apply_scale=True,
rotated=is_rotated,
angle_bound_on=True,
angle_bound_lo=-180,
angle_bound_hi=180,
clip_angle_thresh=1.0,
legacy_plus_one=False,
)
roi_pred_bbox = to_device(roi_pred_bbox, device)
roi_batch_splits = to_device(roi_batch_splits, device)
nms_outputs = torch.ops._caffe2.BoxWithNMSLimit(
to_device(class_prob, "cpu"),
to_device(roi_pred_bbox, "cpu"),
to_device(roi_batch_splits, "cpu"),
score_thresh=float(score_thresh),
nms=float(nms_thresh),
detections_per_im=int(topk_per_image),
soft_nms_enabled=False,
soft_nms_method="linear",
soft_nms_sigma=0.5,
soft_nms_min_score_thres=0.001,
rotated=is_rotated,
cls_agnostic_bbox_reg=cls_agnostic_bbox_reg,
input_boxes_include_bg_cls=False,
output_classes_include_bg_cls=False,
legacy_plus_one=False,
)
roi_score_nms = to_device(nms_outputs[0], device)
roi_bbox_nms = to_device(nms_outputs[1], device)
roi_class_nms = to_device(nms_outputs[2], device)
roi_batch_splits_nms = to_device(nms_outputs[3], device)
roi_keeps_nms = to_device(nms_outputs[4], device)
roi_keeps_size_nms = to_device(nms_outputs[5], device)
if not tensor_mode:
roi_class_nms = roi_class_nms.to(torch.int64)
roi_batch_ids = cat(
[
torch.full((b, 1), i, dtype=dtype, device=device)
for i, b in enumerate(int(x.item()) for x in roi_batch_splits_nms)
],
dim=0,
)
roi_class_nms = alias(roi_class_nms, "class_nms")
roi_score_nms = alias(roi_score_nms, "score_nms")
roi_bbox_nms = alias(roi_bbox_nms, "bbox_nms")
roi_batch_splits_nms = alias(roi_batch_splits_nms, "batch_splits_nms")
roi_keeps_nms = alias(roi_keeps_nms, "keeps_nms")
roi_keeps_size_nms = alias(roi_keeps_size_nms, "keeps_size_nms")
results = InstancesList(
im_info=im_info,
indices=roi_batch_ids[:, 0],
extra_fields={
"pred_boxes": Caffe2Boxes(roi_bbox_nms),
"scores": roi_score_nms,
"pred_classes": roi_class_nms,
},
)
if not tensor_mode:
results = InstancesList.to_d2_instances_list(results)
batch_splits = roi_batch_splits_nms.int().tolist()
kept_indices = list(roi_keeps_nms.to(torch.int64).split(batch_splits))
else:
results = [results]
kept_indices = [roi_keeps_nms]
return results, kept_indices
class Caffe2FastRCNNOutputsInference:
def __init__(self, tensor_mode):
self.tensor_mode = tensor_mode # whether the output is caffe2 tensor mode
def __call__(self, box_predictor, predictions, proposals):
return caffe2_fast_rcnn_outputs_inference(
self.tensor_mode, box_predictor, predictions, proposals
)
def caffe2_mask_rcnn_inference(pred_mask_logits, pred_instances):
"""equivalent to mask_head.mask_rcnn_inference"""
if all(isinstance(x, InstancesList) for x in pred_instances):
assert len(pred_instances) == 1
mask_probs_pred = pred_mask_logits.sigmoid()
mask_probs_pred = alias(mask_probs_pred, "mask_fcn_probs")
pred_instances[0].set("pred_masks", mask_probs_pred)
else:
mask_rcnn_inference(pred_mask_logits, pred_instances)
class Caffe2MaskRCNNInference:
def __call__(self, pred_mask_logits, pred_instances):
return caffe2_mask_rcnn_inference(pred_mask_logits, pred_instances)
def caffe2_keypoint_rcnn_inference(use_heatmap_max_keypoint, pred_keypoint_logits, pred_instances):
# just return the keypoint heatmap for now,
    # there will be an option to call HeatmapMaxKeypointOp
output = alias(pred_keypoint_logits, "kps_score")
if all(isinstance(x, InstancesList) for x in pred_instances):
assert len(pred_instances) == 1
if use_heatmap_max_keypoint:
device = output.device
output = torch.ops._caffe2.HeatmapMaxKeypoint(
to_device(output, "cpu"),
pred_instances[0].pred_boxes.tensor,
                should_output_softmax=True,  # worth making it configurable?
)
output = to_device(output, device)
output = alias(output, "keypoints_out")
pred_instances[0].set("pred_keypoints", output)
return pred_keypoint_logits
class Caffe2KeypointRCNNInference:
def __init__(self, use_heatmap_max_keypoint):
self.use_heatmap_max_keypoint = use_heatmap_max_keypoint
def __call__(self, pred_keypoint_logits, pred_instances):
return caffe2_keypoint_rcnn_inference(
self.use_heatmap_max_keypoint, pred_keypoint_logits, pred_instances
)
# Copyright (c) Facebook, Inc. and its affiliates.
import copy
import io
import logging
import numpy as np
from typing import List
import onnx
import onnx.optimizer
import torch
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python.onnx.backend import Caffe2Backend
from tabulate import tabulate
from termcolor import colored
from torch.onnx import OperatorExportTypes
from .shared import (
ScopedWS,
construct_init_net_from_params,
fuse_alias_placeholder,
fuse_copy_between_cpu_and_gpu,
get_params_from_init_net,
group_norm_replace_aten_with_caffe2,
infer_device_type,
remove_dead_end_ops,
remove_reshape_for_fc,
save_graph,
)
logger = logging.getLogger(__name__)
def export_onnx_model(model, inputs):
"""
Trace and export a model to onnx format.
Args:
model (nn.Module):
inputs (tuple[args]): the model will be called by `model(*inputs)`
Returns:
an onnx model
"""
assert isinstance(model, torch.nn.Module)
# make sure all modules are in eval mode, onnx may change the training state
# of the module if the states are not consistent
def _check_eval(module):
assert not module.training
model.apply(_check_eval)
# Export the model to ONNX
with torch.no_grad():
with io.BytesIO() as f:
torch.onnx.export(
model,
inputs,
f,
operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
# verbose=True, # NOTE: uncomment this for debugging
# export_params=True,
)
onnx_model = onnx.load_from_string(f.getvalue())
return onnx_model
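# A minimal usage sketch (not part of the original file; `model` and
# `image_tensor` are hypothetical). export_onnx_model asserts eval mode.
def _example_export_onnx(model, image_tensor):
    model.eval()
    return export_onnx_model(model, (image_tensor,))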
def _op_stats(net_def):
type_count = {}
for t in [op.type for op in net_def.op]:
type_count[t] = type_count.get(t, 0) + 1
type_count_list = sorted(type_count.items(), key=lambda kv: kv[0]) # alphabet
type_count_list = sorted(type_count_list, key=lambda kv: -kv[1]) # count
return "\n".join("{:>4}x {}".format(count, name) for name, count in type_count_list)
def _assign_device_option(
predict_net: caffe2_pb2.NetDef, init_net: caffe2_pb2.NetDef, tensor_inputs: List[torch.Tensor]
):
"""
    An ONNX-exported network has no concept of device; assign the necessary
    device option to each op so the net is runnable on a GPU runtime.
"""
def _get_device_type(torch_tensor):
assert torch_tensor.device.type in ["cpu", "cuda"]
assert torch_tensor.device.index == 0
return torch_tensor.device.type
def _assign_op_device_option(net_proto, net_ssa, blob_device_types):
for op, ssa_i in zip(net_proto.op, net_ssa):
if op.type in ["CopyCPUToGPU", "CopyGPUToCPU"]:
op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
else:
devices = [blob_device_types[b] for b in ssa_i[0] + ssa_i[1]]
assert all(d == devices[0] for d in devices)
if devices[0] == "cuda":
op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
# update ops in predict_net
predict_net_input_device_types = {
(name, 0): _get_device_type(tensor)
for name, tensor in zip(predict_net.external_input, tensor_inputs)
}
predict_net_device_types = infer_device_type(
predict_net, known_status=predict_net_input_device_types, device_name_style="pytorch"
)
predict_net_ssa, _ = core.get_ssa(predict_net)
_assign_op_device_option(predict_net, predict_net_ssa, predict_net_device_types)
# update ops in init_net
init_net_ssa, versions = core.get_ssa(init_net)
init_net_output_device_types = {
(name, versions[name]): predict_net_device_types[(name, 0)]
for name in init_net.external_output
}
init_net_device_types = infer_device_type(
init_net, known_status=init_net_output_device_types, device_name_style="pytorch"
)
_assign_op_device_option(init_net, init_net_ssa, init_net_device_types)
def export_caffe2_detection_model(model: torch.nn.Module, tensor_inputs: List[torch.Tensor]):
"""
Export a caffe2-compatible Detectron2 model to caffe2 format via ONNX.
    Args:
model: a caffe2-compatible version of detectron2 model, defined in caffe2_modeling.py
tensor_inputs: a list of tensors that caffe2 model takes as input.
"""
model = copy.deepcopy(model)
assert isinstance(model, torch.nn.Module)
assert hasattr(model, "encode_additional_info")
# Export via ONNX
logger.info(
"Exporting a {} model via ONNX ...".format(type(model).__name__)
+ " Some warnings from ONNX are expected and are usually not to worry about."
)
onnx_model = export_onnx_model(model, (tensor_inputs,))
# Convert ONNX model to Caffe2 protobuf
init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model)
ops_table = [[op.type, op.input, op.output] for op in predict_net.op]
table = tabulate(ops_table, headers=["type", "input", "output"], tablefmt="pipe")
logger.info(
"ONNX export Done. Exported predict_net (before optimizations):\n" + colored(table, "cyan")
)
# Apply protobuf optimization
fuse_alias_placeholder(predict_net, init_net)
if any(t.device.type != "cpu" for t in tensor_inputs):
fuse_copy_between_cpu_and_gpu(predict_net)
remove_dead_end_ops(init_net)
_assign_device_option(predict_net, init_net, tensor_inputs)
params, device_options = get_params_from_init_net(init_net)
predict_net, params = remove_reshape_for_fc(predict_net, params)
init_net = construct_init_net_from_params(params, device_options)
group_norm_replace_aten_with_caffe2(predict_net)
# Record necessary information for running the pb model in Detectron2 system.
model.encode_additional_info(predict_net, init_net)
logger.info("Operators used in predict_net: \n{}".format(_op_stats(predict_net)))
logger.info("Operators used in init_net: \n{}".format(_op_stats(init_net)))
return predict_net, init_net
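# A minimal usage sketch (hypothetical model/inputs/paths): export a
# caffe2-compatible model and serialize both nets to disk.
def _example_export_and_save(c2_compatible_model, tensor_inputs, output_dir):
    import os
    predict_net, init_net = export_caffe2_detection_model(c2_compatible_model, tensor_inputs)
    with open(os.path.join(output_dir, "model.pb"), "wb") as f:
        f.write(predict_net.SerializeToString())
    with open(os.path.join(output_dir, "model_init.pb"), "wb") as f:
        f.write(init_net.SerializeToString())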
def run_and_save_graph(predict_net, init_net, tensor_inputs, graph_save_path):
"""
    Run the caffe2 model on the given inputs, record blob shapes, and draw the graph.
predict_net/init_net: caffe2 model.
tensor_inputs: a list of tensors that caffe2 model takes as input.
graph_save_path: path for saving graph of exported model.
"""
logger.info("Saving graph of ONNX exported model to {} ...".format(graph_save_path))
save_graph(predict_net, graph_save_path, op_only=False)
# Run the exported Caffe2 net
logger.info("Running ONNX exported model ...")
with ScopedWS("__ws_tmp__", True) as ws:
ws.RunNetOnce(init_net)
initialized_blobs = set(ws.Blobs())
uninitialized = [inp for inp in predict_net.external_input if inp not in initialized_blobs]
for name, blob in zip(uninitialized, tensor_inputs):
ws.FeedBlob(name, blob)
try:
ws.RunNetOnce(predict_net)
except RuntimeError as e:
logger.warning("Encountered RuntimeError: \n{}".format(str(e)))
ws_blobs = {b: ws.FetchBlob(b) for b in ws.Blobs()}
blob_sizes = {b: ws_blobs[b].shape for b in ws_blobs if isinstance(ws_blobs[b], np.ndarray)}
logger.info("Saving graph with blob shapes to {} ...".format(graph_save_path))
save_graph(predict_net, graph_save_path, op_only=False, blob_sizes=blob_sizes)
return ws_blobs
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
from itertools import count
import torch
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
from .shared import ScopedWS, get_pb_arg_vali, get_pb_arg_vals, infer_device_type
logger = logging.getLogger(__name__)
# ===== ref: mobile-vision predictor's 'Caffe2Wrapper' class ======
class ProtobufModel(torch.nn.Module):
"""
    Wrapper of a caffe2 protobuf model.
    It works just like nn.Module, but runs caffe2 under the hood.
Input/Output are tuple[tensor] that match the caffe2 net's external_input/output.
"""
_ids = count(0)
def __init__(self, predict_net, init_net):
logger.info(f"Initializing ProtobufModel for: {predict_net.name} ...")
super().__init__()
assert isinstance(predict_net, caffe2_pb2.NetDef)
assert isinstance(init_net, caffe2_pb2.NetDef)
# create unique temporary workspace for each instance
self.ws_name = "__tmp_ProtobufModel_{}__".format(next(self._ids))
self.net = core.Net(predict_net)
logger.info("Running init_net once to fill the parameters ...")
with ScopedWS(self.ws_name, is_reset=True, is_cleanup=False) as ws:
ws.RunNetOnce(init_net)
uninitialized_external_input = []
for blob in self.net.Proto().external_input:
if blob not in ws.Blobs():
uninitialized_external_input.append(blob)
ws.CreateBlob(blob)
ws.CreateNet(self.net)
self._error_msgs = set()
self._input_blobs = uninitialized_external_input
def _infer_output_devices(self, inputs):
"""
Returns:
            list[str]: list of device types, one for each external output
"""
def _get_device_type(torch_tensor):
assert torch_tensor.device.type in ["cpu", "cuda"]
assert torch_tensor.device.index == 0
return torch_tensor.device.type
predict_net = self.net.Proto()
input_device_types = {
(name, 0): _get_device_type(tensor) for name, tensor in zip(self._input_blobs, inputs)
}
device_type_map = infer_device_type(
predict_net, known_status=input_device_types, device_name_style="pytorch"
)
ssa, versions = core.get_ssa(predict_net)
versioned_outputs = [(name, versions[name]) for name in predict_net.external_output]
output_devices = [device_type_map[outp] for outp in versioned_outputs]
return output_devices
def forward(self, inputs):
"""
Args:
inputs (tuple[torch.Tensor])
Returns:
tuple[torch.Tensor]
"""
assert len(inputs) == len(self._input_blobs), (
f"Length of inputs ({len(inputs)}) "
f"doesn't match the required input blobs: {self._input_blobs}"
)
with ScopedWS(self.ws_name, is_reset=False, is_cleanup=False) as ws:
for b, tensor in zip(self._input_blobs, inputs):
ws.FeedBlob(b, tensor)
try:
ws.RunNet(self.net.Proto().name)
except RuntimeError as e:
                if str(e) not in self._error_msgs:
                    self._error_msgs.add(str(e))
                    logger.warning("Encountered new RuntimeError: \n{}".format(str(e)))
                    logger.warning("Catching the error and using partial results.")
c2_outputs = [ws.FetchBlob(b) for b in self.net.Proto().external_output]
            # Remove outputs of the current run; this is necessary to prevent
            # fetching results from a previous run if the model fails in the
            # middle.
for b in self.net.Proto().external_output:
                # Need to create an uninitialized blob to make the net runnable.
                # This is "equivalent" to: ws.RemoveBlob(b) then ws.CreateBlob(b),
                # but there's no such API.
ws.FeedBlob(b, f"{b}, a C++ native class of type nullptr (uninitialized).")
# Cast output to torch.Tensor on the desired device
output_devices = (
self._infer_output_devices(inputs)
if any(t.device.type != "cpu" for t in inputs)
else ["cpu" for _ in self.net.Proto().external_output]
)
outputs = []
for name, c2_output, device in zip(
self.net.Proto().external_output, c2_outputs, output_devices
):
if not isinstance(c2_output, np.ndarray):
raise RuntimeError(
"Invalid output for blob {}, received: {}".format(name, c2_output)
)
outputs.append(torch.tensor(c2_output).to(device=device))
return tuple(outputs)
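# A minimal usage sketch (hypothetical nets/inputs): feed tensors to a
# ProtobufModel and key its outputs by the net's external_output names.
def _example_run_protobuf_model(predict_net, init_net, tensor_inputs):
    model = ProtobufModel(predict_net, init_net)
    outputs = model(tuple(tensor_inputs))  # tuple[Tensor] -> tuple[Tensor]
    return dict(zip(model.net.Proto().external_output, outputs))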
class ProtobufDetectionModel(torch.nn.Module):
"""
    A class that works just like a pytorch meta arch at inference time, but runs
    a caffe2 model under the hood.
"""
def __init__(self, predict_net, init_net, *, convert_outputs=None):
"""
Args:
predict_net, init_net (core.Net): caffe2 nets
            convert_outputs (callable): a function that converts caffe2
outputs to the same format of the original pytorch model.
By default, use the one defined in the caffe2 meta_arch.
"""
super().__init__()
self.protobuf_model = ProtobufModel(predict_net, init_net)
self.size_divisibility = get_pb_arg_vali(predict_net, "size_divisibility", 0)
self.device = get_pb_arg_vals(predict_net, "device", b"cpu").decode("ascii")
if convert_outputs is None:
meta_arch = get_pb_arg_vals(predict_net, "meta_architecture", b"GeneralizedRCNN")
meta_arch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[meta_arch.decode("ascii")]
self._convert_outputs = meta_arch.get_outputs_converter(predict_net, init_net)
else:
self._convert_outputs = convert_outputs
def _convert_inputs(self, batched_inputs):
# currently all models convert inputs in the same way
return convert_batched_inputs_to_c2_format(
batched_inputs, self.size_divisibility, self.device
)
def forward(self, batched_inputs):
c2_inputs = self._convert_inputs(batched_inputs)
c2_results = self.protobuf_model(c2_inputs)
c2_results = dict(zip(self.protobuf_model.net.Proto().external_output, c2_results))
return self._convert_outputs(batched_inputs, c2_inputs, c2_results)
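# A minimal usage sketch (hypothetical image tensor): ProtobufDetectionModel
# consumes the standard detectron2 batched-dict input format.
def _example_run_detection_model(predict_net, init_net, image_chw):
    model = ProtobufDetectionModel(predict_net, init_net)
    return model([{"image": image_chw}])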
# Copyright (c) Facebook, Inc. and its affiliates.
import functools
import io
import struct
import types
import torch
from detectron2.modeling import meta_arch
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.roi_heads import keypoint_head
from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes
from .c10 import Caffe2Compatible
from .caffe2_patch import ROIHeadsPatcher, patch_generalized_rcnn
from .shared import (
alias,
check_set_pb_arg,
get_pb_arg_floats,
get_pb_arg_valf,
get_pb_arg_vali,
get_pb_arg_vals,
mock_torch_nn_functional_interpolate,
)
def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
"""
    A function to assemble a caffe2 model's outputs (i.e. Dict[str, Tensor])
    into detectron2's format (i.e. list of Instances).
    This only works when the model follows Caffe2 Detectron's naming convention.
    Args:
        image_sizes (List[List[int, int]]): [H, W] of every image.
        tensor_outputs (Dict[str, Tensor]): external_output to its tensor.
        force_mask_on (bool): if true, ensure there'll be pred_masks even if the
            mask is not found in tensor_outputs (usually due to a model crash).
"""
results = [Instances(image_size) for image_size in image_sizes]
batch_splits = tensor_outputs.get("batch_splits", None)
if batch_splits:
raise NotImplementedError()
assert len(image_sizes) == 1
result = results[0]
bbox_nms = tensor_outputs["bbox_nms"]
score_nms = tensor_outputs["score_nms"]
class_nms = tensor_outputs["class_nms"]
    # Detection will always succeed because Conv supports 0-batch
assert bbox_nms is not None
assert score_nms is not None
assert class_nms is not None
if bbox_nms.shape[1] == 5:
result.pred_boxes = RotatedBoxes(bbox_nms)
else:
result.pred_boxes = Boxes(bbox_nms)
result.scores = score_nms
result.pred_classes = class_nms.to(torch.int64)
mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
if mask_fcn_probs is not None:
# finish the mask pred
mask_probs_pred = mask_fcn_probs
num_masks = mask_probs_pred.shape[0]
class_pred = result.pred_classes
indices = torch.arange(num_masks, device=class_pred.device)
mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
result.pred_masks = mask_probs_pred
elif force_mask_on:
# NOTE: there's no way to know the height/width of mask here, it won't be
# used anyway when batch size is 0, so just set them to 0.
result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)
keypoints_out = tensor_outputs.get("keypoints_out", None)
kps_score = tensor_outputs.get("kps_score", None)
if keypoints_out is not None:
        # keypoints_out: [N, 4, #keypoints], where 4 is in order of (x, y, score, prob)
keypoints_tensor = keypoints_out
        # NOTE: prob may not be calculated if "should_output_softmax" is set to
        # False in HeatmapMaxKeypoint, so we just use the raw score; this does
        # not seem to affect mAP. TODO: check more carefully.
keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]]
result.pred_keypoints = keypoint_xyp
elif kps_score is not None:
# keypoint heatmap to sparse data structure
pred_keypoint_logits = kps_score
keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result])
return results
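# A minimal sketch (hypothetical, zero detections): the smallest set of output
# blobs assemble_rcnn_outputs_by_name accepts for a box-only model.
def _example_assemble_box_only():
    tensor_outputs = {
        "bbox_nms": torch.zeros(0, 4),
        "score_nms": torch.zeros(0),
        "class_nms": torch.zeros(0),
    }
    return assemble_rcnn_outputs_by_name([[480, 640]], tensor_outputs)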
def _cast_to_f32(f64):
return struct.unpack("f", struct.pack("f", f64))[0]
def set_caffe2_compatible_tensor_mode(model, enable=True):
def _fn(m):
if isinstance(m, Caffe2Compatible):
m.tensor_mode = enable
model.apply(_fn)
def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility, device):
"""
See get_caffe2_inputs() below.
"""
assert all(isinstance(x, dict) for x in batched_inputs)
assert all(x["image"].dim() == 3 for x in batched_inputs)
images = [x["image"] for x in batched_inputs]
images = ImageList.from_tensors(images, size_divisibility)
im_info = []
for input_per_image, image_size in zip(batched_inputs, images.image_sizes):
target_height = input_per_image.get("height", image_size[0])
target_width = input_per_image.get("width", image_size[1]) # noqa
        # NOTE: The scale inside im_info is kept by convention, and provides
        # post-processing information in case further processing is needed. For
        # current Caffe2 model definitions that don't include post-processing inside
        # the model, this number is not used.
        # NOTE: There can be a slight difference between the width and height
        # scales; using a single number can result in numerical differences
        # compared with D2's post-processing.
scale = target_height / image_size[0]
im_info.append([image_size[0], image_size[1], scale])
im_info = torch.Tensor(im_info)
return images.tensor.to(device), im_info.to(device)
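# A minimal sketch (hypothetical 480x640 image): the resulting data tensor is
# batched NCHW (padded to size_divisibility); im_info rows are (height, width, scale).
def _example_c2_format():
    batched_inputs = [{"image": torch.zeros(3, 480, 640), "height": 480, "width": 640}]
    data, im_info = convert_batched_inputs_to_c2_format(
        batched_inputs, size_divisibility=32, device="cpu"
    )
    return data.shape, im_info  # torch.Size([1, 3, 480, 640]), tensor([[480., 640., 1.]])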
class Caffe2MetaArch(Caffe2Compatible, torch.nn.Module):
"""
Base class for caffe2-compatible implementation of a meta architecture.
    The forward is traceable and its traced graph can be converted to a caffe2
    graph through ONNX.
"""
def __init__(self, cfg, torch_model, enable_tensor_mode=True):
"""
Args:
cfg (CfgNode):
torch_model (nn.Module): the detectron2 model (meta_arch) to be
converted.
"""
super().__init__()
self._wrapped_model = torch_model
self.eval()
set_caffe2_compatible_tensor_mode(self, enable_tensor_mode)
def get_caffe2_inputs(self, batched_inputs):
"""
Convert pytorch-style structured inputs to caffe2-style inputs that
are tuples of tensors.
Args:
batched_inputs (list[dict]): inputs to a detectron2 model
in its standard format. Each dict has "image" (CHW tensor), and optionally
"height" and "width".
Returns:
tuple[Tensor]:
tuple of tensors that will be the inputs to the
:meth:`forward` method. For existing models, the first
is an NCHW tensor (padded and batched); the second is
                an im_info Nx3 tensor, where the rows are
(height, width, unused legacy parameter)
"""
return convert_batched_inputs_to_c2_format(
batched_inputs,
self._wrapped_model.backbone.size_divisibility,
self._wrapped_model.device,
)
def encode_additional_info(self, predict_net, init_net):
"""
Save extra metadata that will be used by inference in the output protobuf.
"""
pass
def forward(self, inputs):
"""
Run the forward in caffe2-style. It has to use caffe2-compatible ops
and the method will be used for tracing.
Args:
            inputs (tuple[Tensor]): inputs defined by :meth:`get_caffe2_inputs`.
They will be the inputs of the converted caffe2 graph.
Returns:
tuple[Tensor]: output tensors. They will be the outputs of the
converted caffe2 graph.
"""
raise NotImplementedError
def _caffe2_preprocess_image(self, inputs):
"""
Caffe2 implementation of preprocess_image, which is called inside each MetaArch's forward.
It normalizes the input images, and the final caffe2 graph assumes the
inputs have been batched already.
"""
data, im_info = inputs
data = alias(data, "data")
im_info = alias(im_info, "im_info")
mean, std = self._wrapped_model.pixel_mean, self._wrapped_model.pixel_std
normalized_data = (data - mean) / std
normalized_data = alias(normalized_data, "normalized_data")
# Pack (data, im_info) into ImageList which is recognized by self.inference.
images = ImageList(tensor=normalized_data, image_sizes=im_info)
return images
@staticmethod
def get_outputs_converter(predict_net, init_net):
"""
Creates a function that converts outputs of the caffe2 model to
detectron2's standard format.
The function uses information in `predict_net` and `init_net` that are
        available at inference time. Therefore the function logic can be used in inference.
The returned function has the following signature:
def convert(batched_inputs, c2_inputs, c2_results) -> detectron2_outputs
Where
* batched_inputs (list[dict]): the original input format of the meta arch
* c2_inputs (tuple[Tensor]): the caffe2 inputs.
* c2_results (dict[str, Tensor]): the caffe2 output format,
corresponding to the outputs of the :meth:`forward` function.
* detectron2_outputs: the original output format of the meta arch.
This function can be used to compare the outputs of the original meta arch and
the converted caffe2 graph.
Returns:
callable: a callable of the above signature.
"""
raise NotImplementedError
class Caffe2GeneralizedRCNN(Caffe2MetaArch):
def __init__(self, cfg, torch_model, enable_tensor_mode=True):
assert isinstance(torch_model, meta_arch.GeneralizedRCNN)
torch_model = patch_generalized_rcnn(torch_model)
super().__init__(cfg, torch_model, enable_tensor_mode)
try:
use_heatmap_max_keypoint = cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT
except AttributeError:
use_heatmap_max_keypoint = False
self.roi_heads_patcher = ROIHeadsPatcher(
self._wrapped_model.roi_heads, use_heatmap_max_keypoint
)
if self.tensor_mode:
self.roi_heads_patcher.patch_roi_heads()
def encode_additional_info(self, predict_net, init_net):
size_divisibility = self._wrapped_model.backbone.size_divisibility
check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
check_set_pb_arg(
predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
)
check_set_pb_arg(predict_net, "meta_architecture", "s", b"GeneralizedRCNN")
@mock_torch_nn_functional_interpolate()
def forward(self, inputs):
if not self.tensor_mode:
return self._wrapped_model.inference(inputs)
images = self._caffe2_preprocess_image(inputs)
features = self._wrapped_model.backbone(images.tensor)
proposals, _ = self._wrapped_model.proposal_generator(images, features)
detector_results, _ = self._wrapped_model.roi_heads(images, features, proposals)
return tuple(detector_results[0].flatten())
@staticmethod
def get_outputs_converter(predict_net, init_net):
def f(batched_inputs, c2_inputs, c2_results):
_, im_info = c2_inputs
image_sizes = [[int(im[0]), int(im[1])] for im in im_info]
results = assemble_rcnn_outputs_by_name(image_sizes, c2_results)
return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
return f
class Caffe2RetinaNet(Caffe2MetaArch):
def __init__(self, cfg, torch_model):
assert isinstance(torch_model, meta_arch.RetinaNet)
super().__init__(cfg, torch_model)
@mock_torch_nn_functional_interpolate()
def forward(self, inputs):
assert self.tensor_mode
images = self._caffe2_preprocess_image(inputs)
        # explicitly return the image sizes to prevent ONNX from removing "im_info",
        # since it's not used in the forward path
return_tensors = [images.image_sizes]
features = self._wrapped_model.backbone(images.tensor)
features = [features[f] for f in self._wrapped_model.head_in_features]
for i, feature_i in enumerate(features):
features[i] = alias(feature_i, "feature_{}".format(i), is_backward=True)
return_tensors.append(features[i])
pred_logits, pred_anchor_deltas = self._wrapped_model.head(features)
for i, (box_cls_i, box_delta_i) in enumerate(zip(pred_logits, pred_anchor_deltas)):
return_tensors.append(alias(box_cls_i, "box_cls_{}".format(i)))
return_tensors.append(alias(box_delta_i, "box_delta_{}".format(i)))
return tuple(return_tensors)
def encode_additional_info(self, predict_net, init_net):
size_divisibility = self._wrapped_model.backbone.size_divisibility
check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
check_set_pb_arg(
predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
)
check_set_pb_arg(predict_net, "meta_architecture", "s", b"RetinaNet")
# Inference parameters:
check_set_pb_arg(
predict_net, "score_threshold", "f", _cast_to_f32(self._wrapped_model.test_score_thresh)
)
check_set_pb_arg(
predict_net, "topk_candidates", "i", self._wrapped_model.test_topk_candidates
)
check_set_pb_arg(
predict_net, "nms_threshold", "f", _cast_to_f32(self._wrapped_model.test_nms_thresh)
)
check_set_pb_arg(
predict_net,
"max_detections_per_image",
"i",
self._wrapped_model.max_detections_per_image,
)
check_set_pb_arg(
predict_net,
"bbox_reg_weights",
"floats",
[_cast_to_f32(w) for w in self._wrapped_model.box2box_transform.weights],
)
self._encode_anchor_generator_cfg(predict_net)
def _encode_anchor_generator_cfg(self, predict_net):
# serialize anchor_generator for future use
serialized_anchor_generator = io.BytesIO()
torch.save(self._wrapped_model.anchor_generator, serialized_anchor_generator)
        # Ideally we could put anchor generation inside the model; then we
        # wouldn't need to store this information.
        anchor_bytes = serialized_anchor_generator.getvalue()
        check_set_pb_arg(predict_net, "serialized_anchor_generator", "s", anchor_bytes)
@staticmethod
def get_outputs_converter(predict_net, init_net):
self = types.SimpleNamespace()
serialized_anchor_generator = io.BytesIO(
get_pb_arg_vals(predict_net, "serialized_anchor_generator", None)
)
self.anchor_generator = torch.load(serialized_anchor_generator)
bbox_reg_weights = get_pb_arg_floats(predict_net, "bbox_reg_weights", None)
self.box2box_transform = Box2BoxTransform(weights=tuple(bbox_reg_weights))
self.test_score_thresh = get_pb_arg_valf(predict_net, "score_threshold", None)
self.test_topk_candidates = get_pb_arg_vali(predict_net, "topk_candidates", None)
self.test_nms_thresh = get_pb_arg_valf(predict_net, "nms_threshold", None)
self.max_detections_per_image = get_pb_arg_vali(
predict_net, "max_detections_per_image", None
)
# hack to reuse inference code from RetinaNet
for meth in [
"forward_inference",
"inference_single_image",
"_transpose_dense_predictions",
"_decode_multi_level_predictions",
"_decode_per_level_predictions",
]:
setattr(self, meth, functools.partial(getattr(meta_arch.RetinaNet, meth), self))
def f(batched_inputs, c2_inputs, c2_results):
_, im_info = c2_inputs
image_sizes = [[int(im[0]), int(im[1])] for im in im_info]
dummy_images = ImageList(
torch.randn(
(
len(im_info),
3,
)
+ tuple(image_sizes[0])
),
image_sizes,
)
num_features = len([x for x in c2_results.keys() if x.startswith("box_cls_")])
pred_logits = [c2_results["box_cls_{}".format(i)] for i in range(num_features)]
pred_anchor_deltas = [c2_results["box_delta_{}".format(i)] for i in range(num_features)]
# For each feature level, feature should have the same batch size and
# spatial dimension as the box_cls and box_delta.
dummy_features = [x.clone()[:, 0:0, :, :] for x in pred_logits]
            # self.num_classes can be inferred
self.num_classes = pred_logits[0].shape[1] // (pred_anchor_deltas[0].shape[1] // 4)
results = self.forward_inference(
dummy_images, dummy_features, [pred_logits, pred_anchor_deltas]
)
return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
return f
META_ARCH_CAFFE2_EXPORT_TYPE_MAP = {
"GeneralizedRCNN": Caffe2GeneralizedRCNN,
"RetinaNet": Caffe2RetinaNet,
}
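# A minimal sketch (hypothetical cfg/torch_model): the map above is the
# dispatch table for picking a caffe2-compatible wrapper by meta architecture.
def _example_build_caffe2_compatible(cfg, torch_model):
    C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[cfg.MODEL.META_ARCHITECTURE]
    return C2MetaArch(cfg, torch_model)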
# Copyright (c) Facebook, Inc. and its affiliates.
import contextlib
from unittest import mock
import torch
from detectron2.modeling import poolers
from detectron2.modeling.proposal_generator import rpn
from detectron2.modeling.roi_heads import keypoint_head, mask_head
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers
from .c10 import (
Caffe2Compatible,
Caffe2FastRCNNOutputsInference,
Caffe2KeypointRCNNInference,
Caffe2MaskRCNNInference,
Caffe2ROIPooler,
Caffe2RPN,
caffe2_fast_rcnn_outputs_inference,
caffe2_keypoint_rcnn_inference,
caffe2_mask_rcnn_inference,
)
class GenericMixin:
pass
class Caffe2CompatibleConverter:
"""
    A GenericUpdater which implements the `create_from` interface by modifying
    the module object and assigning it another class, replaceCls.
"""
def __init__(self, replaceCls):
self.replaceCls = replaceCls
def create_from(self, module):
# update module's class to the new class
assert isinstance(module, torch.nn.Module)
if issubclass(self.replaceCls, GenericMixin):
# replaceCls should act as mixin, create a new class on-the-fly
new_class = type(
"{}MixedWith{}".format(self.replaceCls.__name__, module.__class__.__name__),
(self.replaceCls, module.__class__),
{}, # {"new_method": lambda self: ...},
)
module.__class__ = new_class
else:
            # replaceCls is a complete class; this allows an arbitrary class swap
module.__class__ = self.replaceCls
# initialize Caffe2Compatible
if isinstance(module, Caffe2Compatible):
module.tensor_mode = False
return module
def patch(model, target, updater, *args, **kwargs):
"""
    Recursively (post-order) update all modules of the target type and its
    subclasses, performing initialization/composition/inheritance/... via
    updater.create_from.
"""
for name, module in model.named_children():
model._modules[name] = patch(module, target, updater, *args, **kwargs)
if isinstance(model, target):
return updater.create_from(model, *args, **kwargs)
return model
def patch_generalized_rcnn(model):
ccc = Caffe2CompatibleConverter
model = patch(model, rpn.RPN, ccc(Caffe2RPN))
model = patch(model, poolers.ROIPooler, ccc(Caffe2ROIPooler))
return model
@contextlib.contextmanager
def mock_fastrcnn_outputs_inference(
tensor_mode, check=True, box_predictor_type=FastRCNNOutputLayers
):
with mock.patch.object(
box_predictor_type,
"inference",
autospec=True,
side_effect=Caffe2FastRCNNOutputsInference(tensor_mode),
) as mocked_func:
yield
if check:
assert mocked_func.call_count > 0
@contextlib.contextmanager
def mock_mask_rcnn_inference(tensor_mode, patched_module, check=True):
with mock.patch(
"{}.mask_rcnn_inference".format(patched_module), side_effect=Caffe2MaskRCNNInference()
) as mocked_func:
yield
if check:
assert mocked_func.call_count > 0
@contextlib.contextmanager
def mock_keypoint_rcnn_inference(tensor_mode, patched_module, use_heatmap_max_keypoint, check=True):
with mock.patch(
"{}.keypoint_rcnn_inference".format(patched_module),
side_effect=Caffe2KeypointRCNNInference(use_heatmap_max_keypoint),
) as mocked_func:
yield
if check:
assert mocked_func.call_count > 0
class ROIHeadsPatcher:
def __init__(self, heads, use_heatmap_max_keypoint):
self.heads = heads
self.use_heatmap_max_keypoint = use_heatmap_max_keypoint
self.previous_patched = {}
@contextlib.contextmanager
def mock_roi_heads(self, tensor_mode=True):
"""
        Patch several inference functions inside ROIHeads and its subclasses.
        Args:
            tensor_mode (bool): whether the inputs/outputs are in caffe2's tensor
                format or not. Defaults to True.
"""
        # NOTE: this requires that `keypoint_rcnn_inference` and `mask_rcnn_inference`
        # are called inside the same file as BaseXxxHead, due to the use of mock.patch.
kpt_heads_mod = keypoint_head.BaseKeypointRCNNHead.__module__
mask_head_mod = mask_head.BaseMaskRCNNHead.__module__
mock_ctx_managers = [
mock_fastrcnn_outputs_inference(
tensor_mode=tensor_mode,
check=True,
box_predictor_type=type(self.heads.box_predictor),
)
]
if getattr(self.heads, "keypoint_on", False):
mock_ctx_managers += [
mock_keypoint_rcnn_inference(
tensor_mode, kpt_heads_mod, self.use_heatmap_max_keypoint
)
]
if getattr(self.heads, "mask_on", False):
mock_ctx_managers += [mock_mask_rcnn_inference(tensor_mode, mask_head_mod)]
with contextlib.ExitStack() as stack: # python 3.3+
for mgr in mock_ctx_managers:
stack.enter_context(mgr)
yield
def patch_roi_heads(self, tensor_mode=True):
self.previous_patched["box_predictor"] = self.heads.box_predictor.inference
self.previous_patched["keypoint_rcnn"] = keypoint_head.keypoint_rcnn_inference
self.previous_patched["mask_rcnn"] = mask_head.mask_rcnn_inference
def patched_fastrcnn_outputs_inference(predictions, proposal):
return caffe2_fast_rcnn_outputs_inference(
True, self.heads.box_predictor, predictions, proposal
)
self.heads.box_predictor.inference = patched_fastrcnn_outputs_inference
if getattr(self.heads, "keypoint_on", False):
def patched_keypoint_rcnn_inference(pred_keypoint_logits, pred_instances):
return caffe2_keypoint_rcnn_inference(
self.use_heatmap_max_keypoint, pred_keypoint_logits, pred_instances
)
keypoint_head.keypoint_rcnn_inference = patched_keypoint_rcnn_inference
if getattr(self.heads, "mask_on", False):
def patched_mask_rcnn_inference(pred_mask_logits, pred_instances):
return caffe2_mask_rcnn_inference(pred_mask_logits, pred_instances)
mask_head.mask_rcnn_inference = patched_mask_rcnn_inference
def unpatch_roi_heads(self):
self.heads.box_predictor.inference = self.previous_patched["box_predictor"]
keypoint_head.keypoint_rcnn_inference = self.previous_patched["keypoint_rcnn"]
mask_head.mask_rcnn_inference = self.previous_patched["mask_rcnn"]
# Copyright (c) Facebook, Inc. and its affiliates.
import collections
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple
import torch
from torch import nn
from detectron2.structures import Boxes, Instances, ROIMasks
from detectron2.utils.registry import _convert_target_to_string, locate
from .torchscript_patch import patch_builtin_len
@dataclass
class Schema:
"""
    A Schema defines how to flatten a possibly hierarchical object into a tuple of
    primitive objects, so it can be used as inputs/outputs of PyTorch's tracing.
    PyTorch does not support tracing a function that produces rich output
    structures (e.g. dict, Instances, Boxes). To trace such a function, we
    flatten the rich object into a tuple of tensors, and return this tuple of
    tensors instead. Meanwhile, we also need to know how to "rebuild" the
    original object from the flattened results, so we can evaluate the
    flattened results.
    A Schema defines how to flatten an object and, while flattening it, records
    the necessary schemas so that the object can be rebuilt from the flattened
    outputs.
    The flattened object and the schema object are returned by the ``flatten``
    classmethod. The original object can then be rebuilt with the schema's
    ``__call__`` method.
    A Schema is a dataclass that can be serialized easily.
"""
# inspired by FetchMapper in tensorflow/python/client/session.py
@classmethod
def flatten(cls, obj):
raise NotImplementedError
def __call__(self, values):
raise NotImplementedError
@staticmethod
def _concat(values):
ret = ()
sizes = []
for v in values:
assert isinstance(v, tuple), "Flattened results must be a tuple"
ret = ret + v
sizes.append(len(v))
return ret, sizes
@staticmethod
def _split(values, sizes):
if len(sizes):
expected_len = sum(sizes)
assert (
len(values) == expected_len
), f"Values has length {len(values)} but expect length {expected_len}."
ret = []
for k in range(len(sizes)):
begin, end = sum(sizes[:k]), sum(sizes[: k + 1])
ret.append(values[begin:end])
return ret
@dataclass
class ListSchema(Schema):
schemas: List[Schema] # the schemas that define how to flatten each element in the list
sizes: List[int] # the flattened length of each element
def __call__(self, values):
values = self._split(values, self.sizes)
if len(values) != len(self.schemas):
raise ValueError(
f"Values has length {len(values)} but schemas " f"has length {len(self.schemas)}!"
)
values = [m(v) for m, v in zip(self.schemas, values)]
return list(values)
@classmethod
def flatten(cls, obj):
res = [flatten_to_tuple(k) for k in obj]
values, sizes = cls._concat([k[0] for k in res])
return values, cls([k[1] for k in res], sizes)
@dataclass
class TupleSchema(ListSchema):
def __call__(self, values):
return tuple(super().__call__(values))
@dataclass
class IdentitySchema(Schema):
def __call__(self, values):
return values[0]
@classmethod
def flatten(cls, obj):
return (obj,), cls()
@dataclass
class DictSchema(ListSchema):
keys: List[str]
def __call__(self, values):
values = super().__call__(values)
return dict(zip(self.keys, values))
@classmethod
def flatten(cls, obj):
for k in obj.keys():
if not isinstance(k, str):
raise KeyError("Only support flattening dictionaries if keys are str.")
keys = sorted(obj.keys())
values = [obj[k] for k in keys]
ret, schema = ListSchema.flatten(values)
return ret, cls(schema.schemas, schema.sizes, keys)
@dataclass
class InstancesSchema(DictSchema):
def __call__(self, values):
image_size, fields = values[-1], values[:-1]
fields = super().__call__(fields)
return Instances(image_size, **fields)
@classmethod
def flatten(cls, obj):
ret, schema = super().flatten(obj.get_fields())
size = obj.image_size
if not isinstance(size, torch.Tensor):
size = torch.tensor(size)
return ret + (size,), schema
@dataclass
class TensorWrapSchema(Schema):
"""
    For classes that are simple wrappers of tensors, e.g.
Boxes, RotatedBoxes, BitMasks
"""
class_name: str
def __call__(self, values):
return locate(self.class_name)(values[0])
@classmethod
def flatten(cls, obj):
return (obj.tensor,), cls(_convert_target_to_string(type(obj)))
# if more custom structures needed in the future, can allow
# passing in extra schemas for custom types
def flatten_to_tuple(obj):
"""
Flatten an object so it can be used for PyTorch tracing.
Also returns how to rebuild the original object from the flattened outputs.
Returns:
res (tuple): the flattened results that can be used as tracing outputs
schema: an object with a ``__call__`` method such that ``schema(res) == obj``.
It is a pure dataclass that can be serialized.
"""
schemas = [
((str, bytes), IdentitySchema),
(list, ListSchema),
(tuple, TupleSchema),
(collections.abc.Mapping, DictSchema),
(Instances, InstancesSchema),
((Boxes, ROIMasks), TensorWrapSchema),
]
for klass, schema in schemas:
if isinstance(obj, klass):
F = schema
break
else:
F = IdentitySchema
return F.flatten(obj)
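# A minimal sketch: flatten a dict of tensors and rebuild it via the schema.
def _example_flatten_roundtrip():
    obj = {"a": torch.zeros(2), "b": torch.ones(3)}
    res, schema = flatten_to_tuple(obj)  # res is a tuple of two tensors
    rebuilt = schema(res)  # rebuilt == {"a": ..., "b": ...}
    return rebuilt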
class TracingAdapter(nn.Module):
"""
    A model may take rich input/output formats (e.g. dict or custom classes),
    but `torch.jit.trace` requires a tuple of tensors as input/output.
    This adapter flattens the input/output format of a model so it becomes traceable.
    It also records the necessary schema to rebuild the model's inputs/outputs from
    flattened inputs/outputs.
Example:
::
outputs = model(inputs) # inputs/outputs may be rich structure
adapter = TracingAdapter(model, inputs)
# can now trace the model, with adapter.flattened_inputs, or another
# tuple of tensors with the same length and meaning
traced = torch.jit.trace(adapter, adapter.flattened_inputs)
# traced model can only produce flattened outputs (tuple of tensors)
flattened_outputs = traced(*adapter.flattened_inputs)
# adapter knows the schema to convert it back (new_outputs == outputs)
new_outputs = adapter.outputs_schema(flattened_outputs)
"""
flattened_inputs: Tuple[torch.Tensor] = None
"""
Flattened version of inputs given to this class's constructor.
"""
inputs_schema: Schema = None
"""
Schema of the inputs given to this class's constructor.
"""
outputs_schema: Schema = None
"""
Schema of the output produced by calling the given model with inputs.
"""
def __init__(
self,
model: nn.Module,
inputs,
inference_func: Optional[Callable] = None,
allow_non_tensor: bool = False,
):
"""
Args:
model: an nn.Module
inputs: An input argument or a tuple of input arguments used to call model.
After flattening, it has to only consist of tensors.
            inference_func: a callable that takes (model, *inputs), calls the
                model with inputs, and returns outputs. By default it
                is ``lambda model, *inputs: model(*inputs)``. Can be overridden
                if you need to call the model differently.
allow_non_tensor: allow inputs/outputs to contain non-tensor objects.
This option will filter out non-tensor objects to make the
model traceable, but ``inputs_schema``/``outputs_schema`` cannot be
used anymore because inputs/outputs cannot be rebuilt from pure tensors.
This is useful when you're only interested in the single trace of
execution (e.g. for flop count), but not interested in
generalizing the traced graph to new inputs.
"""
super().__init__()
if isinstance(model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)):
model = model.module
self.model = model
if not isinstance(inputs, tuple):
inputs = (inputs,)
self.inputs = inputs
self.allow_non_tensor = allow_non_tensor
if inference_func is None:
inference_func = lambda model, *inputs: model(*inputs) # noqa
self.inference_func = inference_func
self.flattened_inputs, self.inputs_schema = flatten_to_tuple(inputs)
if all(isinstance(x, torch.Tensor) for x in self.flattened_inputs):
return
if self.allow_non_tensor:
self.flattened_inputs = tuple(
[x for x in self.flattened_inputs if isinstance(x, torch.Tensor)]
)
self.inputs_schema = None
else:
for input in self.flattened_inputs:
if not isinstance(input, torch.Tensor):
raise ValueError(
"Inputs for tracing must only contain tensors. "
f"Got a {type(input)} instead."
)
def forward(self, *args: torch.Tensor):
with torch.no_grad(), patch_builtin_len():
if self.inputs_schema is not None:
inputs_orig_format = self.inputs_schema(args)
else:
if len(args) != len(self.flattened_inputs) or any(
x is not y for x, y in zip(args, self.flattened_inputs)
):
raise ValueError(
"TracingAdapter does not contain valid inputs_schema."
" So it cannot generalize to other inputs and must be"
" traced with `.flattened_inputs`."
)
inputs_orig_format = self.inputs
outputs = self.inference_func(self.model, *inputs_orig_format)
flattened_outputs, schema = flatten_to_tuple(outputs)
flattened_output_tensors = tuple(
[x for x in flattened_outputs if isinstance(x, torch.Tensor)]
)
if len(flattened_output_tensors) < len(flattened_outputs):
if self.allow_non_tensor:
flattened_outputs = flattened_output_tensors
self.outputs_schema = None
else:
raise ValueError(
"Model cannot be traced because some model outputs "
"cannot flatten to tensors."
)
else: # schema is valid
if self.outputs_schema is None:
self.outputs_schema = schema
else:
assert self.outputs_schema == schema, (
"Model should always return outputs with the same "
"structure so it can be traced!"
)
return flattened_outputs
def _create_wrapper(self, traced_model):
"""
Return a function that has an input/output interface the same as the
original model, but it calls the given traced model under the hood.
"""
def forward(*args):
flattened_inputs, _ = flatten_to_tuple(args)
flattened_outputs = traced_model(*flattened_inputs)
return self.outputs_schema(flattened_outputs)
return forward
# Copyright (c) Facebook, Inc. and its affiliates.
import collections
import copy
import functools
import logging
import numpy as np
import os
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from unittest import mock
import caffe2.python.utils as putils
import torch
import torch.nn.functional as F
from caffe2.proto import caffe2_pb2
from caffe2.python import core, net_drawer, workspace
from torch.nn.functional import interpolate as interp
logger = logging.getLogger(__name__)
# ==== torch/utils_toffee/cast.py =======================================
def to_device(t, device_str):
"""
This function is a replacement of .to(another_device) such that it allows the
casting to be traced properly by explicitly calling the underlying copy ops.
    It also avoids introducing unnecessary ops when casting to the same device.
"""
src = t.device
dst = torch.device(device_str)
if src == dst:
return t
elif src.type == "cuda" and dst.type == "cpu":
return torch.ops._caffe2.CopyGPUToCPU(t)
elif src.type == "cpu" and dst.type == "cuda":
return torch.ops._caffe2.CopyCPUToGPU(t)
else:
raise RuntimeError("Can't cast tensor from device {} to device {}".format(src, dst))
# ==== torch/utils_toffee/interpolate.py =======================================
# Note: borrowed from vision/detection/fair/detectron/detectron/modeling/detector.py
def BilinearInterpolation(tensor_in, up_scale):
assert up_scale % 2 == 0, "Scale should be even"
def upsample_filt(size):
factor = (size + 1) // 2
if size % 2 == 1:
center = factor - 1
else:
center = factor - 0.5
og = np.ogrid[:size, :size]
return (1 - abs(og[0] - center) / factor) * (1 - abs(og[1] - center) / factor)
kernel_size = int(up_scale) * 2
bil_filt = upsample_filt(kernel_size)
dim = int(tensor_in.shape[1])
kernel = np.zeros((dim, dim, kernel_size, kernel_size), dtype=np.float32)
kernel[range(dim), range(dim), :, :] = bil_filt
tensor_out = F.conv_transpose2d(
tensor_in,
weight=to_device(torch.Tensor(kernel), tensor_in.device),
bias=None,
stride=int(up_scale),
padding=int(up_scale / 2),
)
return tensor_out
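# A minimal sketch (hypothetical shape): 2x upsampling with the
# transposed-convolution fallback above; (1, 8, 16, 16) -> (1, 8, 32, 32).
def _example_bilinear_2x():
    x = torch.rand(1, 8, 16, 16)
    return BilinearInterpolation(x, up_scale=2)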
# NOTE: ONNX is incompatible with traced torch.nn.functional.interpolate if
# using dynamic `scale_factor` rather than static `size`. (T43166860)
# NOTE: Caffe2 Int8 conversion might not be able to quantize `size` properly.
def onnx_compatible_interpolate(
input, size=None, scale_factor=None, mode="nearest", align_corners=None
):
# NOTE: The input dimensions are interpreted in the form:
# `mini-batch x channels x [optional depth] x [optional height] x width`.
if size is None and scale_factor is not None:
if input.dim() == 4:
if isinstance(scale_factor, (int, float)):
height_scale, width_scale = (scale_factor, scale_factor)
else:
assert isinstance(scale_factor, (tuple, list))
assert len(scale_factor) == 2
height_scale, width_scale = scale_factor
assert not align_corners, "No matching C2 op for align_corners == True"
if mode == "nearest":
return torch.ops._caffe2.ResizeNearest(
input, order="NCHW", width_scale=width_scale, height_scale=height_scale
)
elif mode == "bilinear":
logger.warning(
"Use F.conv_transpose2d for bilinear interpolate"
" because there's no such C2 op, this may cause significant"
" slowdown and the boundary pixels won't be as same as"
" using F.interpolate due to padding."
)
assert height_scale == width_scale
return BilinearInterpolation(input, up_scale=height_scale)
logger.warning("Output size is not static, it might cause ONNX conversion issue")
return interp(input, size, scale_factor, mode, align_corners)
def mock_torch_nn_functional_interpolate():
def decorator(func):
@functools.wraps(func)
def _mock_torch_nn_functional_interpolate(*args, **kwargs):
if torch.onnx.is_in_onnx_export():
with mock.patch(
"torch.nn.functional.interpolate", side_effect=onnx_compatibale_interpolate
):
return func(*args, **kwargs)
else:
return func(*args, **kwargs)
return _mock_torch_nn_functional_interpolate
return decorator
# ==== torch/utils_caffe2/ws_utils.py ==========================================
class ScopedWS:
def __init__(self, ws_name, is_reset, is_cleanup=False):
self.ws_name = ws_name
self.is_reset = is_reset
self.is_cleanup = is_cleanup
self.org_ws = ""
def __enter__(self):
self.org_ws = workspace.CurrentWorkspace()
if self.ws_name is not None:
workspace.SwitchWorkspace(self.ws_name, True)
if self.is_reset:
workspace.ResetWorkspace()
return workspace
def __exit__(self, *args):
if self.is_cleanup:
workspace.ResetWorkspace()
if self.ws_name is not None:
workspace.SwitchWorkspace(self.org_ws)
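# A minimal usage sketch (hypothetical init_net): run a net inside an isolated
# workspace; the previous workspace is restored on exit.
def _example_scoped_ws(init_net):
    with ScopedWS("__example_ws__", is_reset=True, is_cleanup=True) as ws:
        ws.RunNetOnce(init_net)
        return {b: ws.FetchBlob(b) for b in ws.Blobs()}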
def fetch_any_blob(name):
bb = None
try:
bb = workspace.FetchBlob(name)
except TypeError:
bb = workspace.FetchInt8Blob(name)
except Exception as e:
logger.error("Get blob {} error: {}".format(name, e))
return bb
# ==== torch/utils_caffe2/protobuf.py ==========================================
def get_pb_arg(pb, arg_name):
for x in pb.arg:
if x.name == arg_name:
return x
return None
def get_pb_arg_valf(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return arg.f if arg is not None else default_val
def get_pb_arg_floats(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return list(map(float, arg.floats)) if arg is not None else default_val
def get_pb_arg_ints(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return list(map(int, arg.ints)) if arg is not None else default_val
def get_pb_arg_vali(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return arg.i if arg is not None else default_val
def get_pb_arg_vals(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return arg.s if arg is not None else default_val
def get_pb_arg_valstrings(pb, arg_name, default_val):
arg = get_pb_arg(pb, arg_name)
return list(arg.strings) if arg is not None else default_val
def check_set_pb_arg(pb, arg_name, arg_attr, arg_value, allow_override=False):
arg = get_pb_arg(pb, arg_name)
if arg is None:
arg = putils.MakeArgument(arg_name, arg_value)
assert hasattr(arg, arg_attr)
pb.arg.extend([arg])
if allow_override and getattr(arg, arg_attr) != arg_value:
logger.warning(
"Override argument {}: {} -> {}".format(arg_name, getattr(arg, arg_attr), arg_value)
)
setattr(arg, arg_attr, arg_value)
else:
assert arg is not None
assert getattr(arg, arg_attr) == arg_value, "Existing value {}, new value {}".format(
getattr(arg, arg_attr), arg_value
)
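# A minimal sketch: reading and writing NetDef-level arguments with the
# helpers above (the argument name here is hypothetical).
def _example_pb_args():
    net = caffe2_pb2.NetDef()
    check_set_pb_arg(net, "size_divisibility", "i", 32)
    assert get_pb_arg_vali(net, "size_divisibility", 0) == 32
    return net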
def _create_const_fill_op_from_numpy(name, tensor, device_option=None):
assert type(tensor) is np.ndarray
kTypeNameMapper = {
np.dtype("float32"): "GivenTensorFill",
np.dtype("int32"): "GivenTensorIntFill",
np.dtype("int64"): "GivenTensorInt64Fill",
np.dtype("uint8"): "GivenTensorStringFill",
}
args_dict = {}
if tensor.dtype == np.dtype("uint8"):
args_dict.update({"values": [str(tensor.data)], "shape": [1]})
else:
args_dict.update({"values": tensor, "shape": tensor.shape})
if device_option is not None:
args_dict["device_option"] = device_option
return core.CreateOperator(kTypeNameMapper[tensor.dtype], [], [name], **args_dict)
def _create_const_fill_op_from_c2_int8_tensor(name, int8_tensor):
assert type(int8_tensor) is workspace.Int8Tensor
kTypeNameMapper = {
np.dtype("int32"): "Int8GivenIntTensorFill",
np.dtype("uint8"): "Int8GivenTensorFill",
}
tensor = int8_tensor.data
assert tensor.dtype in [np.dtype("uint8"), np.dtype("int32")]
values = tensor.tobytes() if tensor.dtype == np.dtype("uint8") else tensor
return core.CreateOperator(
kTypeNameMapper[tensor.dtype],
[],
[name],
values=values,
shape=tensor.shape,
Y_scale=int8_tensor.scale,
Y_zero_point=int8_tensor.zero_point,
)
def create_const_fill_op(
name: str,
blob: Union[np.ndarray, workspace.Int8Tensor],
device_option: Optional[caffe2_pb2.DeviceOption] = None,
) -> caffe2_pb2.OperatorDef:
"""
Given a blob object, return the Caffe2 operator that creates this blob
    as a constant. Currently supports NumPy tensors and Caffe2 Int8Tensor.
"""
tensor_type = type(blob)
assert tensor_type in [
np.ndarray,
workspace.Int8Tensor,
], 'Error when creating const fill op for "{}", unsupported blob type: {}'.format(
name, type(blob)
)
if tensor_type == np.ndarray:
return _create_const_fill_op_from_numpy(name, blob, device_option)
elif tensor_type == workspace.Int8Tensor:
assert device_option is None
return _create_const_fill_op_from_c2_int8_tensor(name, blob)
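# A minimal sketch: a float32 array becomes a GivenTensorFill op.
def _example_const_fill():
    op = create_const_fill_op("example_blob", np.zeros((2, 3), dtype=np.float32))
    assert op.type == "GivenTensorFill"
    return op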
def construct_init_net_from_params(
params: Dict[str, Any], device_options: Optional[Dict[str, caffe2_pb2.DeviceOption]] = None
) -> caffe2_pb2.NetDef:
"""
Construct the init_net from params dictionary
"""
init_net = caffe2_pb2.NetDef()
device_options = device_options or {}
for name, blob in params.items():
if isinstance(blob, str):
logger.warning(
(
"Blob {} with type {} is not supported in generating init net,"
" skipped.".format(name, type(blob))
)
)
continue
init_net.op.extend(
[create_const_fill_op(name, blob, device_option=device_options.get(name, None))]
)
init_net.external_output.append(name)
return init_net
def get_producer_map(ssa):
"""
Return dict from versioned blob to (i, j),
where i is index of producer op, j is the index of output of that op.
"""
producer_map = {}
for i in range(len(ssa)):
outputs = ssa[i][1]
for j, outp in enumerate(outputs):
producer_map[outp] = (i, j)
return producer_map
def get_consumer_map(ssa):
"""
Return dict from versioned blob to list of (i, j),
where i is index of consumer op, j is the index of input of that op.
"""
consumer_map = collections.defaultdict(list)
for i in range(len(ssa)):
inputs = ssa[i][0]
for j, inp in enumerate(inputs):
consumer_map[inp].append((i, j))
return consumer_map
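# A minimal sketch (hypothetical net): computing producer/consumer maps from
# the SSA form of a NetDef.
def _example_ssa_maps(net_def):
    ssa, _ = core.get_ssa(net_def)
    producers = get_producer_map(ssa)  # versioned blob -> (op idx, output idx)
    consumers = get_consumer_map(ssa)  # versioned blob -> [(op idx, input idx)]
    return producers, consumers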
def get_params_from_init_net(
init_net: caffe2_pb2.NetDef,
) -> Tuple[Dict[str, Any], Dict[str, caffe2_pb2.DeviceOption]]:
"""
Take the output blobs from init_net by running it.
Outputs:
params: dict from blob name to numpy array
device_options: dict from blob name to the device option of its creating op
"""
    # NOTE: this assumes that each param is determined by its producer op, with the
    # only exception being CopyGPUToCPU, which is a CUDA op that returns a CPU tensor.
def _get_device_option(producer_op):
if producer_op.type == "CopyGPUToCPU":
return caffe2_pb2.DeviceOption()
else:
return producer_op.device_option
with ScopedWS("__get_params_from_init_net__", is_reset=True, is_cleanup=True) as ws:
ws.RunNetOnce(init_net)
params = {b: fetch_any_blob(b) for b in init_net.external_output}
ssa, versions = core.get_ssa(init_net)
producer_map = get_producer_map(ssa)
device_options = {
b: _get_device_option(init_net.op[producer_map[(b, versions[b])][0]])
for b in init_net.external_output
}
return params, device_options
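# A minimal sketch: the params/device_options pair round-trips through
# construct_init_net_from_params, as the export code does.
def _example_params_roundtrip(init_net):
    params, device_options = get_params_from_init_net(init_net)
    return construct_init_net_from_params(params, device_options)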
def _updater_raise(op, input_types, output_types):
raise RuntimeError(
"Failed to apply updater for op {} given input_types {} and"
" output_types {}".format(op, input_types, output_types)
)
def _generic_status_identifier(
predict_net: caffe2_pb2.NetDef,
status_updater: Callable,
known_status: Dict[Tuple[str, int], Any],
) -> Dict[Tuple[str, int], Any]:
"""
    Statically infer the status of each blob; the status can be e.g. device type
    (CPU/GPU), layout (NCHW/NHWC), data type (float32/int8), etc. "Blob" here
    means a versioned blob (Tuple[str, int]) in the format compatible with ssa.
Inputs:
predict_net: the caffe2 network
status_updater: a callable, given an op and the status of its input/output,
it returns the updated status of input/output. `None` is used for
representing unknown status.
known_status: a dict containing known status, used as initialization.
Outputs:
A dict mapping from versioned blob to its status
"""
ssa, versions = core.get_ssa(predict_net)
versioned_ext_input = [(b, 0) for b in predict_net.external_input]
versioned_ext_output = [(b, versions[b]) for b in predict_net.external_output]
all_versioned_blobs = set().union(*[set(x[0] + x[1]) for x in ssa])
allowed_vbs = all_versioned_blobs.union(versioned_ext_input).union(versioned_ext_output)
assert all(k in allowed_vbs for k in known_status)
assert all(v is not None for v in known_status.values())
_known_status = copy.deepcopy(known_status)
def _check_and_update(key, value):
assert value is not None
if key in _known_status:
            if _known_status[key] != value:
                raise RuntimeError(
                    "Conflicting status for {}: existing status {}, new status {}".format(
key, _known_status[key], value
)
)
_known_status[key] = value
def _update_i(op, ssa_i):
versioned_inputs = ssa_i[0]
versioned_outputs = ssa_i[1]
inputs_status = [_known_status.get(b, None) for b in versioned_inputs]
outputs_status = [_known_status.get(b, None) for b in versioned_outputs]
new_inputs_status, new_outputs_status = status_updater(op, inputs_status, outputs_status)
for versioned_blob, status in zip(
versioned_inputs + versioned_outputs, new_inputs_status + new_outputs_status
):
if status is not None:
_check_and_update(versioned_blob, status)
for op, ssa_i in zip(predict_net.op, ssa):
_update_i(op, ssa_i)
for op, ssa_i in zip(reversed(predict_net.op), reversed(ssa)):
_update_i(op, ssa_i)
    # NOTE: This strictly checks that every blob in predict_net has been assigned
    # a known status. However, sometimes that's impossible (e.g. with dead-end
    # ops), so we may relax this constraint if needed.
for k in all_versioned_blobs:
if k not in _known_status:
raise NotImplementedError(
"Can not infer the status for {}. Currently only support the case where"
" a single forward and backward pass can identify status for all blobs.".format(k)
)
return _known_status
def infer_device_type(
predict_net: caffe2_pb2.NetDef,
known_status: Dict[Tuple[str, int], Any],
device_name_style: str = "caffe2",
) -> Dict[Tuple[str, int], str]:
"""Return the device type ("cpu" or "gpu"/"cuda") of each (versioned) blob"""
assert device_name_style in ["caffe2", "pytorch"]
_CPU_STR = "cpu"
_GPU_STR = "gpu" if device_name_style == "caffe2" else "cuda"
def _copy_cpu_to_gpu_updater(op, input_types, output_types):
if input_types[0] == _GPU_STR or output_types[0] == _CPU_STR:
_updater_raise(op, input_types, output_types)
return ([_CPU_STR], [_GPU_STR])
def _copy_gpu_to_cpu_updater(op, input_types, output_types):
if input_types[0] == _CPU_STR or output_types[0] == _GPU_STR:
_updater_raise(op, input_types, output_types)
return ([_GPU_STR], [_CPU_STR])
def _other_ops_updater(op, input_types, output_types):
non_none_types = [x for x in input_types + output_types if x is not None]
if len(non_none_types) > 0:
the_type = non_none_types[0]
if not all(x == the_type for x in non_none_types):
_updater_raise(op, input_types, output_types)
else:
the_type = None
return ([the_type for _ in op.input], [the_type for _ in op.output])
def _device_updater(op, *args, **kwargs):
return {
"CopyCPUToGPU": _copy_cpu_to_gpu_updater,
"CopyGPUToCPU": _copy_gpu_to_cpu_updater,
}.get(op.type, _other_ops_updater)(op, *args, **kwargs)
return _generic_status_identifier(predict_net, _device_updater, known_status)
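# A minimal sketch (hypothetical inputs): seed device inference with the
# devices of the net's external inputs, as the export code above does.
def _example_infer_devices(predict_net, tensor_inputs):
    known = {
        (name, 0): t.device.type
        for name, t in zip(predict_net.external_input, tensor_inputs)
    }
    return infer_device_type(predict_net, known_status=known, device_name_style="pytorch")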
# ==== torch/utils_caffe2/vis.py ===============================================
def _modify_blob_names(ops, blob_rename_f):
ret = []
def _replace_list(blob_list, replaced_list):
del blob_list[:]
blob_list.extend(replaced_list)
for x in ops:
cur = copy.deepcopy(x)
_replace_list(cur.input, list(map(blob_rename_f, cur.input)))
_replace_list(cur.output, list(map(blob_rename_f, cur.output)))
ret.append(cur)
return ret
def _rename_blob(name, blob_sizes, blob_ranges):
def _list_to_str(bsize):
ret = ", ".join([str(x) for x in bsize])
ret = "[" + ret + "]"
return ret
ret = name
if blob_sizes is not None and name in blob_sizes:
ret += "\n" + _list_to_str(blob_sizes[name])
if blob_ranges is not None and name in blob_ranges:
ret += "\n" + _list_to_str(blob_ranges[name])
return ret
# graph_name must not contain the word 'graph'
def save_graph(net, file_name, graph_name="net", op_only=True, blob_sizes=None, blob_ranges=None):
blob_rename_f = functools.partial(_rename_blob, blob_sizes=blob_sizes, blob_ranges=blob_ranges)
return save_graph_base(net, file_name, graph_name, op_only, blob_rename_f)
def save_graph_base(net, file_name, graph_name="net", op_only=True, blob_rename_func=None):
graph = None
ops = net.op
if blob_rename_func is not None:
ops = _modify_blob_names(ops, blob_rename_func)
if not op_only:
graph = net_drawer.GetPydotGraph(ops, graph_name, rankdir="TB")
else:
graph = net_drawer.GetPydotGraphMinimal(
ops, graph_name, rankdir="TB", minimal_dependency=True
)
try:
par_dir = os.path.dirname(file_name)
if not os.path.exists(par_dir):
os.makedirs(par_dir)
        ext = os.path.splitext(os.path.basename(file_name))[-1]
        if ext == ".png":
            graph.write_png(file_name)
        elif ext == ".pdf":
            graph.write_pdf(file_name)
        elif ext == ".svg":
            graph.write_svg(file_name)
        else:
            print("Incorrect format {}".format(ext))
except Exception as e:
print("Error when writing graph to image {}".format(e))
return graph
# ==== torch/utils_toffee/aten_to_caffe2.py ====================================
def group_norm_replace_aten_with_caffe2(predict_net: caffe2_pb2.NetDef):
"""
For an ONNX-exported model, GroupNorm is represented as an ATen op;
this function performs a drop-in replacement from the ATen op to Caffe2's GroupNorm op.
"""
count = 0
for op in predict_net.op:
if op.type == "ATen":
op_name = get_pb_arg_vals(op, "operator", None) # return byte in py3
if op_name and op_name.decode() == "group_norm":
op.arg.remove(get_pb_arg(op, "operator"))
if get_pb_arg_vali(op, "cudnn_enabled", None):
op.arg.remove(get_pb_arg(op, "cudnn_enabled"))
num_groups = get_pb_arg_vali(op, "num_groups", None)
if num_groups is not None:
op.arg.remove(get_pb_arg(op, "num_groups"))
check_set_pb_arg(op, "group", "i", num_groups)
op.type = "GroupNorm"
count += 1
if count > 0:
logger.info("Replaced {} ATen ops with GroupNorm".format(count))
# ==== torch/utils_toffee/alias.py =============================================
def alias(x, name, is_backward=False):
if not torch.onnx.is_in_onnx_export():
return x
assert isinstance(x, torch.Tensor)
return torch.ops._caffe2.AliasWithName(x, name, is_backward=is_backward)
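# Usage sketch (hypothetical module; the blob name "input_features" and
# self.backbone are assumptions): call alias() inside forward() to pin a
# stable blob name. Outside of ONNX export it simply returns x unchanged.
#
#   def forward(self, x):
#       x = alias(x, "input_features")
#       return self.backbone(x)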
def fuse_alias_placeholder(predict_net, init_net):
"""Remove AliasWithName placeholder and rename the input/output of it"""
# First we finish all the re-naming
for i, op in enumerate(predict_net.op):
if op.type == "AliasWithName":
assert len(op.input) == 1
assert len(op.output) == 1
name = get_pb_arg_vals(op, "name", None).decode()
is_backward = bool(get_pb_arg_vali(op, "is_backward", 0))
rename_op_input(predict_net, init_net, i, 0, name, from_producer=is_backward)
rename_op_output(predict_net, i, 0, name)
# Remove AliasWithName; this should be very safe since it's a no-op
new_ops = []
for op in predict_net.op:
if op.type != "AliasWithName":
new_ops.append(op)
else:
# safety check
assert op.input == op.output
assert op.input[0] == op.arg[0].s.decode()
del predict_net.op[:]
predict_net.op.extend(new_ops)
# ==== torch/utils_caffe2/graph_transform.py ===================================
class IllegalGraphTransformError(ValueError):
"""When a graph transform function call can't be executed."""
def _rename_versioned_blob_in_proto(
proto: caffe2_pb2.NetDef,
old_name: str,
new_name: str,
version: int,
ssa: List[Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]],
start_versions: Dict[str, int],
end_versions: Dict[str, int],
):
"""In given proto, rename all blobs with matched version"""
# Operater list
for op, i_th_ssa in zip(proto.op, ssa):
versioned_inputs, versioned_outputs = i_th_ssa
for i in range(len(op.input)):
if versioned_inputs[i] == (old_name, version):
op.input[i] = new_name
for i in range(len(op.output)):
if versioned_outputs[i] == (old_name, version):
op.output[i] = new_name
# external_input
if start_versions.get(old_name, 0) == version:
for i in range(len(proto.external_input)):
if proto.external_input[i] == old_name:
proto.external_input[i] = new_name
# external_output
if end_versions.get(old_name, 0) == version:
for i in range(len(proto.external_output)):
if proto.external_output[i] == old_name:
proto.external_output[i] = new_name
def rename_op_input(
predict_net: caffe2_pb2.NetDef,
init_net: caffe2_pb2.NetDef,
op_id: int,
input_id: int,
new_name: str,
from_producer: bool = False,
):
"""
Rename the op_id-th operator in predict_net, changing its input_id-th input's
name to new_name. It also does automatic re-routing and changes
external_input and init_net if necessary.
- It requires that the input is only consumed by this op.
- This function modifies predict_net and init_net in-place.
- When from_producer is enabled, this also updates other operators that consume
the same input. Be cautious, because this may trigger unintended behavior.
"""
assert isinstance(predict_net, caffe2_pb2.NetDef)
assert isinstance(init_net, caffe2_pb2.NetDef)
init_net_ssa, init_net_versions = core.get_ssa(init_net)
predict_net_ssa, predict_net_versions = core.get_ssa(
predict_net, copy.deepcopy(init_net_versions)
)
versioned_inputs, versioned_outputs = predict_net_ssa[op_id]
old_name, version = versioned_inputs[input_id]
if from_producer:
producer_map = get_producer_map(predict_net_ssa)
if (old_name, version) not in producer_map:
raise NotImplementedError(
"Can't find producer, the input {} is probably from"
" init_net, this is not supported yet.".format(old_name)
)
producer = producer_map[(old_name, version)]
rename_op_output(predict_net, producer[0], producer[1], new_name)
return
def contain_targets(op_ssa):
return (old_name, version) in op_ssa[0]
is_consumer = [contain_targets(op_ssa) for op_ssa in predict_net_ssa]
if sum(is_consumer) > 1:
raise IllegalGraphTransformError(
(
"Input '{}' of operator(#{}) are consumed by other ops, please use"
+ " rename_op_output on the producer instead. Offending op: \n{}"
).format(old_name, op_id, predict_net.op[op_id])
)
# update init_net
_rename_versioned_blob_in_proto(
init_net, old_name, new_name, version, init_net_ssa, {}, init_net_versions
)
# update predict_net
_rename_versioned_blob_in_proto(
predict_net,
old_name,
new_name,
version,
predict_net_ssa,
init_net_versions,
predict_net_versions,
)
def rename_op_output(predict_net: caffe2_pb2.NetDef, op_id: int, output_id: int, new_name: str):
"""
Rename the op_id-th operator in predict_net, changing its output_id-th output's
name to new_name. It also does automatic re-routing and changes
external_output if necessary.
- It allows multiple consumers of its output.
- This function modifies predict_net in-place; it doesn't need init_net.
"""
assert isinstance(predict_net, caffe2_pb2.NetDef)
ssa, blob_versions = core.get_ssa(predict_net)
versioned_inputs, versioned_outputs = ssa[op_id]
old_name, version = versioned_outputs[output_id]
# update predict_net
_rename_versioned_blob_in_proto(
predict_net, old_name, new_name, version, ssa, {}, blob_versions
)
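# Usage sketch (hypothetical op index and blob name): rename the first output
# of op #3; all downstream references to that versioned blob are re-routed.
#
#   rename_op_output(predict_net, op_id=3, output_id=0, new_name="renamed_blob")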
def get_sub_graph_external_input_output(
predict_net: caffe2_pb2.NetDef, sub_graph_op_indices: List[int]
) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]:
"""
Return the lists of external inputs/outputs of the sub-graph;
each element is a tuple of the blob name and its corresponding version in predict_net.
External input/output is defined the same way as for a caffe2 NetDef.
"""
ssa, versions = core.get_ssa(predict_net)
all_inputs = []
all_outputs = []
for op_id in sub_graph_op_indices:
all_inputs += [inp for inp in ssa[op_id][0] if inp not in all_inputs]
all_outputs += list(ssa[op_id][1]) # ssa output won't repeat
# for versioned blobs, external inputs are just those blobs that appear in
# all_inputs but not in all_outputs
ext_inputs = [inp for inp in all_inputs if inp not in all_outputs]
# external outputs are essentially outputs of this subgraph that are used
# outside of this sub-graph (including predict_net.external_output)
all_other_inputs = sum(
(ssa[i][0] for i in range(len(ssa)) if i not in sub_graph_op_indices),
[(outp, versions[outp]) for outp in predict_net.external_output],
)
ext_outputs = [outp for outp in all_outputs if outp in set(all_other_inputs)]
return ext_inputs, ext_outputs
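# Usage sketch (hypothetical op indices): treat ops #2..#4 as one sub-graph
# and get its boundary blobs as (name, version) tuples.
#
#   ext_inputs, ext_outputs = get_sub_graph_external_input_output(predict_net, [2, 3, 4])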
class DiGraph:
"""A DAG representation of caffe2 graph, each vertice is a versioned blob."""
def __init__(self):
self.vertices = set()
self.graph = collections.defaultdict(list)
def add_edge(self, u, v):
self.graph[u].append(v)
self.vertices.add(u)
self.vertices.add(v)
# adapted from https://www.geeksforgeeks.org/find-paths-given-source-destination/
def get_all_paths(self, s, d):
visited = {k: False for k in self.vertices}
path = []
all_paths = []
def _get_all_paths_util(graph, u, d, visited, path):
visited[u] = True
path.append(u)
if u == d:
all_paths.append(copy.deepcopy(path))
else:
for i in graph[u]:
if not visited[i]:
_get_all_paths_util(graph, i, d, visited, path)
path.pop()
visited[u] = False
_get_all_paths_util(self.graph, s, d, visited, path)
return all_paths
@staticmethod
def from_ssa(ssa):
graph = DiGraph()
for op_id in range(len(ssa)):
for inp in ssa[op_id][0]:
for outp in ssa[op_id][1]:
graph.add_edge(inp, outp)
return graph
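# Usage sketch: a tiny graph with two paths from "a" to "c". Vertices are
# plain strings here for brevity; in this module they are (name, version) tuples.
#
#   g = DiGraph()
#   g.add_edge("a", "b")
#   g.add_edge("b", "c")
#   g.add_edge("a", "c")
#   g.get_all_paths("a", "c")  # -> [["a", "b", "c"], ["a", "c"]]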
def _get_dependency_chain(ssa, versioned_target, versioned_source):
"""
Return the index list of the relevant operators needed to produce the target blob
from the source blob; if there's no dependency, return an empty list.
"""
# finding all paths between nodes can be O(N!), thus we only search within the
# sub-graph spanning from the first consumer of the source blob to the
# producer of the target blob (with a margin of 15 ops on each side).
consumer_map = get_consumer_map(ssa)
producer_map = get_producer_map(ssa)
start_op = min(x[0] for x in consumer_map[versioned_source]) - 15
end_op = (
producer_map[versioned_target][0] + 15 if versioned_target in producer_map else start_op
)
sub_graph_ssa = ssa[start_op : end_op + 1]
if len(sub_graph_ssa) > 30:
logger.warning(
"Subgraph bebetween {} and {} is large (from op#{} to op#{}), it"
" might take non-trival time to find all paths between them.".format(
versioned_source, versioned_target, start_op, end_op
)
)
dag = DiGraph.from_ssa(sub_graph_ssa)
paths = dag.get_all_paths(versioned_source, versioned_target) # include two ends
ops_in_paths = [[producer_map[blob][0] for blob in path[1:]] for path in paths]
return sorted(set().union(*[set(ops) for ops in ops_in_paths]))
def identify_reshape_sub_graph(predict_net: caffe2_pb2.NetDef) -> List[List[int]]:
"""
Identify the reshape sub-graph in a protobuf.
The reshape sub-graph is defined as matching the following pattern:
(input_blob) -> Op_1 -> ... -> Op_N -> (new_shape) -─┐
└-------------------------------------------> Reshape -> (output_blob)
Return:
List of sub-graphs, each sub-graph is represented as a list of indices
of the relevant ops, [Op_1, Op_2, ..., Op_N, Reshape]
"""
ssa, _ = core.get_ssa(predict_net)
ret = []
for i, op in enumerate(predict_net.op):
if op.type == "Reshape":
assert len(op.input) == 2
input_ssa = ssa[i][0]
data_source = input_ssa[0]
shape_source = input_ssa[1]
op_indices = _get_dependency_chain(ssa, shape_source, data_source)
ret.append(op_indices + [i])
return ret
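# Usage sketch: each returned entry lists the op indices computing the
# Reshape's shape input, with the Reshape op itself last; e.g. a net where
# ops #5 and #6 feed the shape of the Reshape at op #9 would yield [[5, 6, 9]]
# (indices are hypothetical).
#
#   reshape_sub_graphs = identify_reshape_sub_graph(predict_net)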
def remove_reshape_for_fc(predict_net, params):
"""
In PyTorch, nn.Linear must take a 2D tensor; this often leads to reshaping a
4D tensor to 2D by calling .view(). However, this (dynamic) reshaping
doesn't work well with ONNX and Int8 tools, and causes extra ops
(eg. ExpandDims) that might not be available on mobile.
Luckily Caffe2 supports 4D tensors for FC, so we can remove those reshapes
after exporting the ONNX model.
"""
from caffe2.python import core
# find all reshape sub-graphs that can be removed, which for now means all
# Reshape sub-graphs whose output is only consumed by FC.
# TODO: to make it safer, we may need the actual value to better determine
# if a Reshape before FC is removable.
reshape_sub_graphs = identify_reshape_sub_graph(predict_net)
sub_graphs_to_remove = []
for reshape_sub_graph in reshape_sub_graphs:
reshape_op_id = reshape_sub_graph[-1]
assert predict_net.op[reshape_op_id].type == "Reshape"
ssa, _ = core.get_ssa(predict_net)
reshape_output = ssa[reshape_op_id][1][0]
consumers = [i for i in range(len(ssa)) if reshape_output in ssa[i][0]]
if all(predict_net.op[consumer].type == "FC" for consumer in consumers):
# safety check if the sub-graph is isolated, for this reshape sub-graph,
# it means it has one non-param external input and one external output.
ext_inputs, ext_outputs = get_sub_graph_external_input_output(
predict_net, reshape_sub_graph
)
non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
if len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1:
sub_graphs_to_remove.append(reshape_sub_graph)
# perform the sub-graph removal by:
# 1: renaming the Reshape's output to its input, so the sub-graph can be
# seen as an in-place identity, meaning its external input/output are the same.
# 2: simply removing those ops.
remove_op_ids = []
params_to_remove = []
for sub_graph in sub_graphs_to_remove:
logger.info(
"Remove Reshape sub-graph:\n{}".format(
"".join(["(#{:>4})\n{}".format(i, predict_net.op[i]) for i in sub_graph])
)
)
reshape_op_id = sub_graph[-1]
new_reshape_output = predict_net.op[reshape_op_id].input[0]
rename_op_output(predict_net, reshape_op_id, 0, new_reshape_output)
ext_inputs, ext_outputs = get_sub_graph_external_input_output(predict_net, sub_graph)
non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
params_ext_inputs = [inp for inp in ext_inputs if inp[1] == 0]
assert len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1
assert ext_outputs[0][0] == non_params_ext_inputs[0][0]
assert ext_outputs[0][1] == non_params_ext_inputs[0][1] + 1
remove_op_ids.extend(sub_graph)
params_to_remove.extend(params_ext_inputs)
predict_net = copy.deepcopy(predict_net)
new_ops = [op for i, op in enumerate(predict_net.op) if i not in remove_op_ids]
del predict_net.op[:]
predict_net.op.extend(new_ops)
for versioned_params in params_to_remove:
name = versioned_params[0]
logger.info("Remove params: {} from init_net and predict_net.external_input".format(name))
del params[name]
predict_net.external_input.remove(name)
return predict_net, params
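# Usage sketch (`params` is assumed to be the dict of weight arrays loaded
# from init_net). Note this returns a new predict_net, while params is pruned
# in-place:
#
#   predict_net, params = remove_reshape_for_fc(predict_net, params)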
def fuse_copy_between_cpu_and_gpu(predict_net: caffe2_pb2.NetDef):
"""
In-place fuse extra copy ops between cpu/gpu for the following case:
a -CopyAToB-> b -CopyBToA-> c1 -NextOp1-> d1
              -CopyBToA-> c2 -NextOp2-> d2
The fused network will look like:
a -NextOp1-> d1
  -NextOp2-> d2
"""
_COPY_OPS = ["CopyCPUToGPU", "CopyGPUToCPU"]
def _fuse_once(predict_net):
ssa, blob_versions = core.get_ssa(predict_net)
consumer_map = get_consumer_map(ssa)
versioned_external_output = [
(name, blob_versions[name]) for name in predict_net.external_output
]
for op_id, op in enumerate(predict_net.op):
if op.type in _COPY_OPS:
fw_copy_versioned_output = ssa[op_id][1][0]
consumer_ids = [x[0] for x in consumer_map[fw_copy_versioned_output]]
reverse_op_type = _COPY_OPS[1 - _COPY_OPS.index(op.type)]
is_fusable = (
len(consumer_ids) > 0
and fw_copy_versioned_output not in versioned_external_output
and all(
predict_net.op[_op_id].type == reverse_op_type
and ssa[_op_id][1][0] not in versioned_external_output
for _op_id in consumer_ids
)
)
if is_fusable:
for rv_copy_op_id in consumer_ids:
# making each NextOp uses "a" directly and removing Copy ops
rs_copy_versioned_output = ssa[rv_copy_op_id][1][0]
next_op_id, inp_id = consumer_map[rs_copy_versioned_output][0]
predict_net.op[next_op_id].input[inp_id] = op.input[0]
# remove CopyOps
new_ops = [
op
for i, op in enumerate(predict_net.op)
if i != op_id and i not in consumer_ids
]
del predict_net.op[:]
predict_net.op.extend(new_ops)
return True
return False
# _fuse_once returns False if nothing can be fused
while _fuse_once(predict_net):
pass
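# Usage sketch: run this in-place pass after device placement, so redundant
# CPU<->GPU round-trips introduced by per-op device assignment are folded away.
#
#   fuse_copy_between_cpu_and_gpu(predict_net)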
def remove_dead_end_ops(net_def: caffe2_pb2.NetDef):
"""remove ops if its output is not used or not in external_output"""
ssa, versions = core.get_ssa(net_def)
versioned_external_output = [(name, versions[name]) for name in net_def.external_output]
consumer_map = get_consumer_map(ssa)
removed_op_ids = set()
def _is_dead_end(versioned_blob):
return not (
versioned_blob in versioned_external_output
or (
len(consumer_map[versioned_blob]) > 0
and all(x[0] not in removed_op_ids for x in consumer_map[versioned_blob])
)
)
for i, ssa_i in reversed(list(enumerate(ssa))):
versioned_outputs = ssa_i[1]
if all(_is_dead_end(outp) for outp in versioned_outputs):
removed_op_ids.add(i)
# simply removing those dead-end ops should have no effect on external_output
new_ops = [op for i, op in enumerate(net_def.op) if i not in removed_op_ids]
del net_def.op[:]
net_def.op.extend(new_ops)
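# Usage sketch: a final in-place cleanup, typically run after other transforms
# may have left some ops with unused outputs.
#
#   remove_dead_end_ops(predict_net)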