Commit f23248c0 authored by facebook-github-bot

Initial commit

fbshipit-source-id: f4a8ba78691d8cf46e003ef0bd2e95f170932778
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import sys
import json
import importlib
import dataclasses
from caffe2.proto import caffe2_pb2
from detectron2.export.caffe2_modeling import (
META_ARCH_CAFFE2_EXPORT_TYPE_MAP,
convert_batched_inputs_to_c2_format,
)
from detectron2.export.shared import get_pb_arg_vali, get_pb_arg_vals
from detectron2.modeling.postprocessing import detector_postprocess
class D2Caffe2MetaArchPreprocessFunc(object):
def __init__(self, size_divisibility, device):
self.size_divisibility = size_divisibility
self.device = device
def __call__(self, inputs):
data, im_info = convert_batched_inputs_to_c2_format(
inputs, self.size_divisibility, self.device
)
return (data, im_info)
@staticmethod
def get_params(cfg, model):
fake_predict_net = caffe2_pb2.NetDef()
model.encode_additional_info(fake_predict_net, None)
size_divisibility = get_pb_arg_vali(fake_predict_net, "size_divisibility", 0)
device = get_pb_arg_vals(fake_predict_net, "device", b"cpu").decode("ascii")
return {
"size_divisibility": size_divisibility,
"device": device,
}
class D2Caffe2MetaArchPostprocessFunc(object):
def __init__(self, external_input, external_output, encoded_info):
self.external_input = external_input
self.external_output = external_output
self.encoded_info = encoded_info
def __call__(self, inputs, tensor_inputs, tensor_outputs):
encoded_info = self.encoded_info.encode("ascii")
fake_predict_net = caffe2_pb2.NetDef().FromString(encoded_info)
meta_architecture = get_pb_arg_vals(fake_predict_net, "meta_architecture", None)
meta_architecture = meta_architecture.decode("ascii")
model_class = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[meta_architecture]
convert_outputs = model_class.get_outputs_converter(fake_predict_net, None)
c2_inputs = tensor_inputs
c2_results = dict(zip(self.external_output, tensor_outputs))
return convert_outputs(inputs, c2_inputs, c2_results)
@staticmethod
def get_params(cfg, model):
# NOTE: the post processing has different values for different meta
# architectures; here we simply rely on the Caffe2 meta architecture to encode
# the info into a NetDef and store it as a whole.
fake_predict_net = caffe2_pb2.NetDef()
model.encode_additional_info(fake_predict_net, None)
encoded_info = fake_predict_net.SerializeToString().decode("ascii")
# HACK: Caffe2MetaArch's post processing requires the blob names of the model
# output; this information is missing for torchscript. There's no easy way to
# know this unless using NamedTuple for tracing.
external_input = ["data", "im_info"]
if cfg.MODEL.META_ARCHITECTURE == "GeneralizedRCNN":
external_output = ["bbox_nms", "score_nms", "class_nms"]
if cfg.MODEL.MASK_ON:
external_output.extend(["mask_fcn_probs"])
if cfg.MODEL.KEYPOINT_ON:
if cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT:
external_output.extend(["keypoints_out"])
else:
external_output.extend(["kps_score"])
else:
raise NotImplementedError("")
return {
"external_input": external_input,
"external_output": external_output,
"encoded_info": encoded_info,
}
def dataclass_object_dump(ob):
datacls = type(ob)
if not dataclasses.is_dataclass(datacls):
raise TypeError(f"Expected dataclass instance, got '{datacls!r}' object")
mod = sys.modules.get(datacls.__module__)
if mod is None or not hasattr(mod, datacls.__qualname__):
raise ValueError(f"Can't resolve '{datacls!r}' reference")
ref = f"{datacls.__module__}.{datacls.__qualname__}"
fields = (f.name for f in dataclasses.fields(ob))
return {**{f: getattr(ob, f) for f in fields}, "__dataclass__": ref}
def dataclass_object_load(d):
ref = d.pop("__dataclass__", None)
if ref is None:
return d
try:
modname, hasdot, qualname = ref.rpartition(".")
module = importlib.import_module(modname)
datacls = getattr(module, qualname)
if not dataclasses.is_dataclass(datacls) or not isinstance(datacls, type):
raise ValueError
return datacls(**d)
except (ModuleNotFoundError, ValueError, AttributeError, TypeError):
raise ValueError(f"Invalid dataclass reference {ref!r}") from None
class D2TracingAdapterPreprocessFunc(object):
def __call__(self, inputs):
assert len(inputs) == 1, "only support single batch"
return inputs[0]["image"]
class D2TracingAdapterPostFunc(object):
def __init__(self, outputs_schema_json):
self.outputs_schema = json.loads(
outputs_schema_json, object_hook=dataclass_object_load
)
def __call__(self, inputs, tensor_inputs, tensor_outputs):
results_per_image = self.outputs_schema(tensor_outputs)
assert len(inputs) == 1, "only support single batch"
width, height = inputs[0]["width"], inputs[0]["height"]
r = detector_postprocess(results_per_image, height, width)
return [{"instances": r}]
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import logging
import detectron2.utils.comm as comm
import mobile_cv.lut.lib.pt.flops_utils as flops_utils
from d2go.utils.helper import run_once
logger = logging.getLogger(__name__)
def print_flops(model, first_batch):
logger.info("Evaluating model's number of parameters and FLOPS")
model_flops = copy.deepcopy(model)
model_flops.eval()
fest = flops_utils.FlopsEstimation(model_flops)
with fest.enable():
model_flops(first_batch)
fest.add_flops_info()
model_str = str(model_flops)
logger.info(model_str)
return model_str
# NOTE: the logging can be too long and messy when printing flops multiple
# times, especially when running eval during training, thus `run_once` is used
# to limit it. TODO: log the flops more concisely.
@run_once()
def add_print_flops_callback(cfg, model, disable_after_callback=True):
def _print_flops_callback(self, model, model_data):
self.add_flops_info()
logger.info("Callback: model flops info:\n{}".format(model))
def _guess_batch_size():
# Inputs are meta-arch dependent; the most general solution would be
# adding a function like `get_batch_size()` to each meta arch
ret = 1
try:
model_input_shapes = model_data(model)["input_shapes"]
assert isinstance(model_input_shapes, list)
assert len(model_input_shapes) > 0
# assuming the first input is a list of images
ret = len(model_input_shapes[0])
except Exception:
ret = cfg.SOLVER.IMS_PER_BATCH // comm.get_world_size()
logger.warning(
"Could not get batch size, compute from"
f" `cfg.SOLVER.IMS_PER_BATCH`={ret}"
)
pass
return ret
nparams, nflops = self.get_flops()
batch_size = _guess_batch_size()
nflops_single = nflops / batch_size
logger.info(
f"Model parameters (M): {nparams}, "
f"MFlops (batch_size={batch_size}): {nflops}, "
f"MFlops (batch_size=1): {nflops_single}"
)
if disable_after_callback:
self.set_enable(False)
fest = flops_utils.FlopsEstimation(model).set_callback(_print_flops_callback)
logger.info("Added callback to log flops info after the first inference")
fest.set_enable(True)
return fest
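# --- Usage sketch (not part of the original file) -----------------------------
# Attach the run-once callback and trigger it with a single forward pass; the
# `cfg`, `model` and `first_batch` arguments are whatever the caller already has.
def _example_log_flops_once(cfg, model, first_batch):
    add_print_flops_callback(cfg, model)
    # The callback fires during this forward pass and then disables itself
    # (disable_after_callback defaults to True).
    return model(first_batch)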
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from d2go.config import CfgNode as CN
from d2go.data.build import (
add_weighted_training_sampler_default_configs,
)
from d2go.data.config import add_d2go_data_default_configs
from d2go.modeling.backbone.fbnet_cfg import (
add_bifpn_default_configs,
add_fbnet_v2_default_configs,
)
from d2go.modeling import kmeans_anchors, model_ema
from d2go.modeling.model_freezing_utils import add_model_freezing_configs
from d2go.modeling.quantization import add_quantization_default_configs
from d2go.modeling.subclass import add_subclass_configs
def add_tensorboard_default_configs(_C):
_C.TENSORBOARD = CN()
# Output from dataloader will be written to tensorboard at this frequency
_C.TENSORBOARD.TRAIN_LOADER_VIS_WRITE_PERIOD = 20
# This controls the max number of images over all batches; be careful when
# increasing this number because it takes disk space and slows down training
_C.TENSORBOARD.TRAIN_LOADER_VIS_MAX_IMAGES = 16
# Max number of images per dataset to visualize in tensorboard during evaluation
_C.TENSORBOARD.TEST_VIS_MAX_IMAGES = 16
# TENSORBOARD.LOG_DIR will be determined solely by OUTPUT_DIR
_C.register_deprecated_key("TENSORBOARD.LOG_DIR")
def add_abnormal_checker_configs(_C):
_C.ABNORMAL_CHECKER = CN()
# check and log the iteration with bad losses if enabled
_C.ABNORMAL_CHECKER.ENABLED = False
def get_default_cfg(_C):
# _C.MODEL.FBNET...
add_fbnet_v2_default_configs(_C)
# _C.MODEL.FROZEN_LAYER_REG_EXP
add_model_freezing_configs(_C)
# _C.MODEL other models
model_ema.add_model_ema_configs(_C)
# _C.D2GO_DATA...
add_d2go_data_default_configs(_C)
# _C.TENSORBOARD...
add_tensorboard_default_configs(_C)
# _C.MODEL.KMEANS...
kmeans_anchors.add_kmeans_anchors_cfg(_C)
# _C.QUANTIZATION
add_quantization_default_configs(_C)
# _C.DATASETS.TRAIN_REPEAT_FACTOR
add_weighted_training_sampler_default_configs(_C)
# _C.ABNORMAL_CHECKER
add_abnormal_checker_configs(_C)
# _C.MODEL.SUBCLASS
add_subclass_configs(_C)
# Set find_unused_parameters for DistributedDataParallel.
_C.MODEL.DDP_FIND_UNUSED_PARAMETERS = False
# Set default optimizer
_C.SOLVER.OPTIMIZER = "sgd"
_C.SOLVER.LR_MULTIPLIER_OVERWRITE = []
# Default world size in D2 is 0, which means scaling is not applied. For D2Go,
# auto scaling is encouraged, so set it to 8
assert _C.SOLVER.REFERENCE_WORLD_SIZE == 0
_C.SOLVER.REFERENCE_WORLD_SIZE = 8
# Besides scaling default D2 configs, also scale quantization configs
_C.SOLVER.AUTO_SCALING_METHODS = [
"default_scale_d2_configs",
"default_scale_quantization_configs",
]
return _C
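# --- Usage sketch (not part of the original file) -----------------------------
# `get_default_cfg` mutates and returns the base config node that the runner
# passes in; `base_cfg` is assumed to be a compatible CfgNode with detectron2's
# default keys already populated.
def _example_build_default_cfg(base_cfg):
    cfg = get_default_cfg(base_cfg)
    assert cfg.SOLVER.OPTIMIZER == "sgd"
    assert cfg.SOLVER.REFERENCE_WORLD_SIZE == 8
    return cfg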
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import errno
import importlib
import inspect
import logging
import math
import os
import re
import tempfile
import zipfile
import pickle
import signal
import sys
import threading
import time
import traceback
import typing
import warnings
import pkg_resources
from contextlib import contextmanager
from functools import partial
from random import random
import six
from functools import wraps
from typing import (
Any,
Callable,
Iterable,
List,
Mapping,
NamedTuple,
Optional,
Tuple,
Type,
TypeVar,
Union,
)
import torch
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch
from detectron2.evaluation import (
CityscapesInstanceEvaluator,
CityscapesSemSegEvaluator,
COCOEvaluator,
COCOPanopticEvaluator,
DatasetEvaluators,
LVISEvaluator,
PascalVOCDetectionEvaluator,
SemSegEvaluator,
verify_results,
)
T = TypeVar("T")
CallbackMapping = Mapping[Callable, Optional[Iterable[Any]]]
FuncType = Callable[..., Any]
F = TypeVar("F", bound=FuncType)
RT = TypeVar("RT")
NT = TypeVar("T", bound=NamedTuple)
from detectron2.utils.events import TensorboardXWriter
class MultipleFunctionCallError(Exception):
pass
def run_once(
raise_on_multiple: bool = False,
# pyre-fixme[34]: `Variable[T]` isn't present in the function's parameters.
) -> Callable[[Callable[..., T]], Callable[..., T]]:
"""
A decorator to wrap a function such that it only ever runs once
Useful, for example, with exit handlers that could be run via atexit or
via a signal handler. The decorator will cache the result of the first call
and return it on subsequent calls. If `raise_on_multiple` is set, any call
to the function after the first one will raise a
`MultipleFunctionCallError`.
"""
def decorator(func: Callable[..., T]) -> Callable[..., T]:
signal: List[T] = []
@wraps(func)
def wrapper(*args, **kwargs) -> T:
if signal:
if raise_on_multiple:
raise MultipleFunctionCallError(
"Function %s was called multiple times" % func.__name__
)
return signal[0]
signal.append(func(*args, **kwargs))
return signal[0]
return wrapper
return decorator
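# --- Example sketch (not part of the original file) ---------------------------
# The decorated function runs only on the first call; later calls return the
# cached result (or raise MultipleFunctionCallError when raise_on_multiple=True).
@run_once()
def _example_expensive_init() -> int:
    return 42


def _example_run_once_usage() -> int:
    first = _example_expensive_init()
    second = _example_expensive_init()  # cached; the body does not run again
    assert first == second == 42
    return second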
class retryable(object):
"""Fake retryable function
"""
def __init__(self, num_tries=1, sleep_time=0.1):
pass
def __call__(self, func: F) -> F:
return func
# pyre-fixme[3]: Return type must be annotated.
# pyre-fixme[2]: Parameter must be annotated.
def get_dir_path(relative_path):
"""Return a path for a directory in this package, extracting if necessary
For an entire directory within the par file (zip, fastzip) or lpar
structure, this function will check to see if the contents are extracted;
extracting each file that has not been extracted. It returns the path of
a directory containing the expected contents, making sure permissions are
correct.
Returns a string path, throws an exception on error
"""
return os.path.dirname(importlib.import_module(relative_path).__file__)
# copy util function for oss
def alias(x, name, is_backward=False):
if not torch.onnx.is_in_onnx_export():
return x
assert isinstance(x, torch.Tensor)
return torch.ops._caffe2.AliasWithName(x, name, is_backward=is_backward)
class D2Trainer(DefaultTrainer):
@classmethod
def build_evaluator(cls, cfg, dataset_name, output_folder=None):
"""
Create evaluator(s) for a given dataset.
This uses the special metadata "evaluator_type" associated with each builtin dataset.
For your own dataset, you can simply create an evaluator manually in your
script and do not have to worry about the hacky if-else logic here.
"""
if output_folder is None:
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
evaluator_list = []
evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
evaluator_list.append(
SemSegEvaluator(
dataset_name,
distributed=True,
output_dir=output_folder,
)
)
if evaluator_type in ["coco", "coco_panoptic_seg"]:
evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))
if evaluator_type == "coco_panoptic_seg":
evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
if evaluator_type == "cityscapes_instance":
assert (
torch.cuda.device_count() >= comm.get_rank()
), "CityscapesEvaluator currently do not work with multiple machines."
return CityscapesInstanceEvaluator(dataset_name)
if evaluator_type == "cityscapes_sem_seg":
assert (
torch.cuda.device_count() >= comm.get_rank()
), "CityscapesEvaluator currently do not work with multiple machines."
return CityscapesSemSegEvaluator(dataset_name)
elif evaluator_type == "pascal_voc":
return PascalVOCDetectionEvaluator(dataset_name)
elif evaluator_type == "lvis":
return LVISEvaluator(dataset_name, output_dir=output_folder)
if len(evaluator_list) == 0:
raise NotImplementedError(
"no Evaluator for the dataset {} with the type {}".format(
dataset_name, evaluator_type
)
)
elif len(evaluator_list) == 1:
return evaluator_list[0]
return DatasetEvaluators(evaluator_list)
def reroute_config_path(path: str) -> str:
"""
Supporting rerouting the config files for convenience:
d2go:// -> mobile-vision/d2go/...
detectron2go:// -> mobile-vision/d2go/configs/...
detectron2:// -> vision/fair/detectron2/configs/...
flow:// -> fblearner/flow/projects/mobile_vision/detectron2go/...
mv_experimental:// -> mobile-vision/experimental/...
(see //mobile-vision/experimental:mv_experimental_d2go_yaml_files)
These configs are considered code, so they'll reflect your current checkout;
try using canary if you have local changes.
"""
if path.startswith("d2go://"):
rel_path = path[len("d2go://") :]
config_in_resource = pkg_resources.resource_filename(
"d2go.model_zoo", os.path.join("configs", rel_path)
)
return config_in_resource
elif path.startswith("detectron2go://"):
rel_path = path[len("detectron2go://") :]
config_in_resource = pkg_resources.resource_filename(
"d2go.model_zoo", os.path.join("configs", rel_path)
)
return config_in_resource
elif path.startswith("detectron2://"):
rel_path = path[len("detectron2://") :]
config_in_resource = pkg_resources.resource_filename(
"detectron2.model_zoo", os.path.join("configs", rel_path)
)
return config_in_resource
elif path.startswith("mv_experimental://"):
rel_path = path[len("mv_experimental://") :]
# pyre-fixme[21]: Could not find module `mv_experimental_d2go_yaml_files`.
import mv_experimental_d2go_yaml_files
package_path = get_dir_path(mv_experimental_d2go_yaml_files.__name__)
return os.path.join(package_path, rel_path)
return path
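# --- Usage sketch (not part of the original file) -----------------------------
# A "d2go://" or "detectron2://" prefix is resolved to the corresponding
# packaged config directory; any other path is returned unchanged. The yaml
# name below is only a hypothetical example.
def _example_reroute_config():
    resolved = reroute_config_path("d2go://faster_rcnn_fbnetv3a_C4.yaml")
    assert resolved.endswith("faster_rcnn_fbnetv3a_C4.yaml")
    assert reroute_config_path("/tmp/local_config.yaml") == "/tmp/local_config.yaml"
    return resolved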
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
def get_launch_environment():
return "local"
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import os
from typing import Dict
import warnings
import detectron2.utils.comm as comm
from d2go.config import CfgNode
from fvcore.common.file_io import PathManager
from tabulate import tabulate
from .tensorboard_log_util import get_tensorboard_log_dir
logger = logging.getLogger(__name__)
def check_version(library, min_version, warning_only=False):
"""Check the version of the library satisfies the provided minimum version.
An exception is thrown if the check does not pass.
Parameters
----------
min_version : str
Minimum version
warning_only : bool
Printing a warning instead of throwing an exception.
"""
from distutils.version import LooseVersion
version = library.__version__
bad_version = LooseVersion(version) < LooseVersion(min_version)
if bad_version:
msg = f'Installed {library.__name__} version {version} does not satisfy the ' \
f'minimum required version {min_version}'
if warning_only:
warnings.warn(msg)
else:
raise AssertionError(msg)
return False
return True
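# --- Usage sketch (not part of the original file) -----------------------------
# Warn, rather than raise, when the installed torch is older than an assumed
# minimum version ("1.5.0" here is only an example).
def _example_check_torch_version() -> bool:
    import torch
    return check_version(torch, "1.5.0", warning_only=True)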
def metrics_dict_to_metrics_table(dic):
assert isinstance(dic, dict)
ret = []
for key in sorted(dic.keys()):
value = dic[key]
if isinstance(value, dict):
for sub_metrics in metrics_dict_to_metrics_table(value):
ret.append([key] + sub_metrics)
else:
ret.append([key, value])
return ret
def print_metrics_table(metrics_dic):
metrics_table = metrics_dict_to_metrics_table(metrics_dic)
metrics_tabulate = tabulate(
metrics_table,
tablefmt="pipe",
headers=["model", "dataset", "task", "metric", "score"],
)
logger.info("Metrics table: \n" + metrics_tabulate)
def dump_trained_model_configs(output_dir: str, trained_cfgs: Dict[str, CfgNode]) -> Dict[str, str]:
"""Writes trained model config files to output_dir.
Args:
output_dir: output file directory.
trained_cfgs: map from model name to the config of trained model.
Returns:
A map of model name to model config path.
"""
trained_model_configs = {}
trained_model_config_dir = os.path.join(output_dir, "trained_model_configs")
PathManager.mkdirs(trained_model_config_dir)
for name, trained_cfg in trained_cfgs.items():
config_file = os.path.join(trained_model_config_dir, "{}.yaml".format(name))
trained_model_configs[name] = config_file
if comm.is_main_process():
with PathManager.open(config_file, "w") as f:
f.write(trained_cfg.dump())
return trained_model_configs
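# --- Usage sketch (not part of the original file) -----------------------------
# Dump the config of a single trained model; the output directory and model
# name are hypothetical.
def _example_dump_single_config(cfg):
    paths = dump_trained_model_configs("/tmp/d2go_output", {"default": cfg})
    return paths["default"]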
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import json
import logging
import torch
from d2go.export.api import PredictorExportConfig
from detectron2.export.caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP
from mobile_cv.predictor.api import FuncInfo
from detectron2.export.flatten import TracingAdapter
from detectron2.export.torchscript_patch import patch_builtin_len
from d2go.utils.export_utils import (D2Caffe2MetaArchPreprocessFunc,
D2Caffe2MetaArchPostprocessFunc, D2TracingAdapterPreprocessFunc, D2TracingAdapterPostFunc,
dataclass_object_dump)
logger = logging.getLogger(__name__)
def d2_meta_arch_prepare_for_export(self, cfg, inputs, export_scheme):
if "torchscript" in export_scheme and "@tracing" in export_scheme:
def inference_func(model, image):
inputs = [{"image": image}]
return model.inference(inputs, do_postprocess=False)[0]
def data_generator(x):
return (x[0]["image"],)
image = data_generator(inputs)[0]
wrapper = TracingAdapter(self, image, inference_func)
wrapper.eval()
# HACK: outputs_schema can only be obtained after running tracing, but
# PredictorExportConfig requires a pre-defined postprocessing function, this
# causes tracing to run twice.
logger.info("tracing the model to get outputs_schema ...")
with torch.no_grad(), patch_builtin_len():
_ = torch.jit.trace(wrapper, (image,))
outputs_schema_json = json.dumps(
wrapper.outputs_schema, default=dataclass_object_dump
)
return PredictorExportConfig(
model=wrapper,
data_generator=data_generator,
preprocess_info=FuncInfo.gen_func_info(
D2TracingAdapterPreprocessFunc, params={}
),
postprocess_info=FuncInfo.gen_func_info(
D2TracingAdapterPostFunc,
params={"outputs_schema_json": outputs_schema_json},
),
)
if cfg.MODEL.META_ARCHITECTURE in META_ARCH_CAFFE2_EXPORT_TYPE_MAP:
C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[cfg.MODEL.META_ARCHITECTURE]
c2_compatible_model = C2MetaArch(cfg, self)
preprocess_info = FuncInfo.gen_func_info(
D2Caffe2MetaArchPreprocessFunc,
params=D2Caffe2MetaArchPreprocessFunc.get_params(cfg, c2_compatible_model),
)
postprocess_info = FuncInfo.gen_func_info(
D2Caffe2MetaArchPostprocessFunc,
params=D2Caffe2MetaArchPostprocessFunc.get_params(cfg, c2_compatible_model),
)
preprocess_func = preprocess_info.instantiate()
return PredictorExportConfig(
model=c2_compatible_model,
# Caffe2MetaArch takes a single tuple as input (which is the return of
# preprocess_func), data_generator requires all positional args as a tuple.
data_generator=lambda x: (preprocess_func(x),),
preprocess_info=preprocess_info,
postprocess_info=postprocess_info,
)
raise NotImplementedError("Can't determine prepare_for_tracing!")
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import os
from functools import lru_cache
def get_tensorboard_log_dir(output_dir):
return output_dir
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import os
import re
import time
from detectron2.checkpoint import DetectionCheckpointer
from fvcore.common.file_io import PathManager
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
logger = logging.getLogger(__name__)
def fetch_checkpoints_till_final(checkpoint_dir):
"""
A generator that yields all checkpoint paths under the given directory; it
keeps refreshing until model_final is found.
"""
MIN_SLEEP_INTERVAL = 1.0 # in seconds
MAX_SLEEP_INTERVAL = 60.0 # in seconds
sleep_interval = MIN_SLEEP_INTERVAL
finished_checkpoints = set()
def _add_and_log(path):
finished_checkpoints.add(path)
logger.info("Found checkpoint: {}".format(path))
return path
def _log_and_sleep(sleep_interval):
logger.info(
"Sleep {} seconds while waiting for model_final.pth".format(sleep_interval)
)
time.sleep(sleep_interval)
return min(sleep_interval * 2, MAX_SLEEP_INTERVAL)
def _get_lightning_checkpoints(path: str):
return [
os.path.join(path, x)
for x in PathManager.ls(path)
if x.endswith(ModelCheckpoint.FILE_EXTENSION)
and not x.startswith(ModelCheckpoint.CHECKPOINT_NAME_LAST)
]
while True:
if not PathManager.exists(checkpoint_dir):
sleep_interval = _log_and_sleep(sleep_interval)
continue
checkpoint_paths = DetectionCheckpointer(
None, save_dir=checkpoint_dir
).get_all_checkpoint_files()
checkpoint_paths.extend(_get_lightning_checkpoints(checkpoint_dir))
final_model_path = None
periodic_checkpoints = []
for path in sorted(checkpoint_paths):
if path.endswith("model_final.pth") or path.endswith("model_final.ckpt"):
final_model_path = path
continue
if path.endswith(ModelCheckpoint.FILE_EXTENSION):
# Lightning checkpoint
model_iter = int(
re.findall(
r"(?<=step=)\d+(?={})".format(ModelCheckpoint.FILE_EXTENSION),
path,
)[0]
)
else:
model_iter = int(re.findall(r"(?<=model_)\d+(?=\.pth)", path)[0])
periodic_checkpoints.append((path, model_iter))
periodic_checkpoints = [
pc for pc in periodic_checkpoints if pc[0] not in finished_checkpoints
]
periodic_checkpoints = sorted(periodic_checkpoints, key=lambda x: x[1])
for pc in periodic_checkpoints:
yield _add_and_log(pc[0])
sleep_interval = MIN_SLEEP_INTERVAL
if final_model_path is None:
sleep_interval = _log_and_sleep(sleep_interval)
else:
yield _add_and_log(final_model_path)
break
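# --- Usage sketch (not part of the original file) -----------------------------
# Consume checkpoints as they appear; the generator only stops after
# model_final has been yielded.
def _example_consume_checkpoints(checkpoint_dir):
    for ckpt_path in fetch_checkpoints_till_final(checkpoint_dir):
        logger.info("Would evaluate checkpoint: %s", ckpt_path)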
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from detectron2.data import DatasetCatalog, MetadataCatalog, detection_utils as utils
from detectron2.evaluation import DatasetEvaluator
from detectron2.modeling import META_ARCH_REGISTRY
from detectron2.utils.events import get_event_storage
from detectron2.utils.visualizer import Visualizer
class VisualizerWrapper(object):
"""
D2's Visualizer provides low-level APIs to draw common structures, such as
draw_instance_predictions/draw_sem_seg/overlay_instances. This class provides
the high-level interface for visualizing.
"""
def __init__(self, cfg):
self.cfg = cfg
def _get_meta_arch_class(self):
return META_ARCH_REGISTRY.get(self.cfg.MODEL.META_ARCHITECTURE)
def visualize_train_input(self, input_dict):
"""
Visualize a single model input image (also the output from the train loader)
used for training; this includes the data augmentation.
"""
per_image = input_dict
cfg = self.cfg
# customization
if hasattr(self._get_meta_arch_class(), "visualize_train_input"):
return self._get_meta_arch_class().visualize_train_input(self, input_dict)
img = per_image["image"].permute(1, 2, 0).cpu().detach().numpy()
img = utils.convert_image_to_rgb(img, cfg.INPUT.FORMAT)
metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
scale = 2.0
visualizer = Visualizer(img, metadata=metadata, scale=scale)
if "instances" in per_image:
target_fields = per_image["instances"].get_fields()
labels = [metadata.thing_classes[i] for i in target_fields["gt_classes"]]
vis = visualizer.overlay_instances(
labels=labels,
boxes=target_fields.get("gt_boxes", None),
masks=target_fields.get("gt_masks", None),
keypoints=target_fields.get("gt_keypoints", None),
)
if "sem_seg" in per_image:
vis = visualizer.draw_sem_seg(
per_image["sem_seg"], area_threshold=0, alpha=0.5
)
return vis.get_image()
def visualize_test_output(
self, dataset_name, dataset_mapper, input_dict, output_dict
):
"""
Visualize the output of the model
"""
# customization
if hasattr(self._get_meta_arch_class(), "visualize_test_output"):
return self._get_meta_arch_class().visualize_test_output(
self, dataset_name, dataset_mapper, input_dict, output_dict
)
image = dataset_mapper._read_image(input_dict, "RGB")
visualizer = Visualizer(image, metadata=MetadataCatalog.get(dataset_name))
if "panoptic_seg" in output_dict:
# NOTE: refer to https://fburl.com/diffusion/evarrhbh
raise NotImplementedError()
if "instances" in output_dict:
visualizer.draw_instance_predictions(output_dict["instances"].to("cpu"))
if "sem_seg" in output_dict:
visualizer.draw_sem_seg(
output_dict["sem_seg"].argmax(dim=0).to("cpu"),
area_threshold=0,
alpha=0.5,
)
return visualizer.get_output().get_image()
def visualize_dataset_dict(self, dataset_name, dataset_mapper, dataset_dict):
"""
Visualize the dataset_dict
"""
image = dataset_mapper._read_image(dataset_dict, "RGB")
visualizer = Visualizer(image, metadata=MetadataCatalog.get(dataset_name))
visualizer.draw_dataset_dict(dataset_dict)
return visualizer.get_output().get_image()
class DataLoaderVisWrapper:
"""
Wrap the data loader to visualize its output via TensorBoardX at a given frequency.
"""
def __init__(self, cfg, tbx_writer, data_loader):
self.tbx_writer = tbx_writer
self.data_loader = data_loader
self._visualizer = VisualizerWrapper(cfg)
self.log_frequency = cfg.TENSORBOARD.TRAIN_LOADER_VIS_WRITE_PERIOD
self.log_limit = cfg.TENSORBOARD.TRAIN_LOADER_VIS_MAX_IMAGES
assert self.log_frequency >= 0
assert self.log_limit >= 0
self._remaining = self.log_limit
def __iter__(self):
for data in self.data_loader:
self._maybe_write_vis(data)
yield data
def _maybe_write_vis(self, data):
try:
storage = get_event_storage()
except AssertionError:
# wrapped data loader might be used outside EventStorage, don't visualize
# anything
return
if (
self.log_frequency == 0
or not storage.iter % self.log_frequency == 0
or self._remaining <= 0
):
return
length = min(len(data), self._remaining)
data = data[:length]
self._remaining -= length
for i, per_image in enumerate(data):
vis_image = self._visualizer.visualize_train_input(per_image)
tag = "train_loader_batch_{}/".format(storage.iter)
if "dataset_name" in per_image:
tag += per_image["dataset_name"] + "/"
if "file_name" in per_image:
tag += "img_{}/{}".format(i, per_image["file_name"])
self.tbx_writer._writer.add_image(
tag=tag,
img_tensor=vis_image,
global_step=storage.iter,
dataformats="HWC",
)
class VisualizationEvaluator(DatasetEvaluator):
"""
Visualize GT and predictions during evaluation. It doesn't calculate any
metrics; it just uses the evaluator's interface as a hook.
"""
# NOTE: the evaluator will be created for every eval (during training and
# after training), so the images will be logged multiple times, use a global
# counter to differentiate them in TB.
_counter = 0
def __init__(
self, cfg, tbx_writer, dataset_mapper, dataset_name, train_iter=None, tag_postfix=None
):
self.tbx_writer = tbx_writer
self.dataset_mapper = dataset_mapper
self.dataset_name = dataset_name
self._visualizer = VisualizerWrapper(cfg)
self.train_iter = train_iter or VisualizationEvaluator._counter
self.tag_postfix = tag_postfix or ""
self.log_limit = max(cfg.TENSORBOARD.TEST_VIS_MAX_IMAGES, 0)
if self.log_limit > 0:
self._metadata = MetadataCatalog.get(dataset_name)
# NOTE: Since there's no GT from test loader, we need to get GT from
# the dataset_dict, this assumes the test data loader uses the item from
# dataset_dict in the default way.
self._dataset_dict = DatasetCatalog.get(dataset_name)
self._file_name_to_dataset_dict = {
dic["file_name"]: dic for dic in self._dataset_dict
}
VisualizationEvaluator._counter += 1
self.reset()
def reset(self):
self._iter = 0
self._log_remaining = self.log_limit
def process(self, inputs, outputs):
for input, output in zip(inputs, outputs):
if self._log_remaining <= 0:
return
file_name = input["file_name"]
dataset_dict = self._file_name_to_dataset_dict[file_name]
gt_img = self._visualizer.visualize_dataset_dict(
self.dataset_name, self.dataset_mapper, dataset_dict
)
pred_img = self._visualizer.visualize_test_output(
self.dataset_name, self.dataset_mapper, input, output
)
tag_base = f"{self.dataset_name}{self.tag_postfix}/eval_iter_{self._iter}/{file_name}"
self.tbx_writer._writer.add_image(
f"{tag_base}/GT",
gt_img,
self.train_iter,
dataformats="HWC",
)
if not isinstance(pred_img, dict):
pred_img = {"Pred": pred_img}
for img_type in pred_img.keys():
self.tbx_writer._writer.add_image(
f"{tag_base}/{img_type}",
pred_img[img_type],
self.train_iter,
dataformats="HWC",
)
self._log_remaining -= 1
self._iter += 1
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
DETR
Copyright 2020 - present, Facebook, Inc
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Deformable DETR
Copyright 2020 SenseTime
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# DETR and Deformable DETR in D2Go
This project extends D2Go with [DETR](https://github.com/facebookresearch/detr) and [Deformable DETR](https://github.com/fundamentalvision/Deformable-DETR) models. Pretrained models with efficient backbones are provided.
## Usage
Please install D2Go following the [instructions](../README.md). Then install this extension:
```bash
cd projects/detr/
python setup.py install
```
### Evaluating Pretrained Models
Please use `tools/train_net.py` in the main directory as the entry point. A pretrained model can be evaluated using
```bash
python train_net.py --runner detr.runner.DETRRunner --eval-only --config configs/deformable_detr_fbnetv3a_bs16.yaml MODEL.WEIGHTS https://mobile-cv.s3-us-west-2.amazonaws.com/d2go/models/252811934/model_final.pth
```
### Training
Please use `tools/train_net.py` in the main directory as the entry point and pass `detr.runner.DETRRunner` as the runner.
```bash
python train_net.py --runner detr.runner.DETRRunner --config configs/deformable_detr_fbnetv3a_bs16.yaml
```
### Pretrained Models
| name | box AP | model id | download |
| ------------------------------------------------------------ | ------ | --------- | ------------------------------------------------------------ |
| [Deformable-DETR-FBNetV3A](./configs/deformable_detr_fbnetv3a_bs16.yaml) | 27.53 | 252811934 | [model](https://mobile-cv.s3-us-west-2.amazonaws.com/d2go/models/252811934/model_final.pth)\|[metrics](https://mobile-cv.s3-us-west-2.amazonaws.com/d2go/models/252811934/metrics.json) |
MODEL:
META_ARCHITECTURE: "Detr"
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MASK_ON: False
BACKBONE:
NAME: "FBNetV2C4Backbone"
FBNET_V2:
ARCH: "FBNetV3_A_dsmask_C5"
NORM: "sync_bn"
WIDTH_DIVISOR: 8
SCALE_FACTOR: 1.0
OUT_FEATURES: ["trunk4"]
DETR:
NUM_CLASSES: 80
DEFORMABLE: True
CLS_WEIGHT: 2.0
DIM_FEEDFORWARD: 1024
GIOU_WEIGHT: 2.0
L1_WEIGHT: 5.0
NUM_OBJECT_QUERIES: 300
CENTERED_POSITION_ENCODIND: True
USE_FOCAL_LOSS: True
NUM_FEATURE_LEVELS: 1
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.0002
STEPS: (887040,)
MAX_ITER: 1108800
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WEIGHT_DECAY: 0.0001
OPTIMIZER: "ADAMW"
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.1
NORM_TYPE: 2.0
LR_MULTIPLIER_OVERWRITE: [{'backbone': 0.1}, {'reference_points': 0.1, 'sampling_offsets': 0.1}]
INPUT:
MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
CROP:
ENABLED: True
TYPE: "absolute_range"
SIZE: (384, 600)
FORMAT: "RGB"
D2GO_DATA:
MAPPER:
NAME: "DETRDatasetMapper"
TEST:
EVAL_PERIOD: 4000
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
VERSION: 2
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from . import models, util, datasets
__all__ = ['models', 'util', 'datasets']
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .config import add_detr_config
from .detr import Detr
from .dataset_mapper import DetrDatasetMapper
__all__ = ['add_detr_config', 'Detr', 'DetrDatasetMapper']
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from detectron2.config import CfgNode as CN
def add_detr_config(cfg):
"""
Add config for DETR.
"""
cfg.MODEL.DETR = CN()
cfg.MODEL.DETR.NUM_CLASSES = 80
# FBNet
cfg.MODEL.FBNET_V2.OUT_FEATURES = ["trunk3"]
# For Segmentation
cfg.MODEL.DETR.FROZEN_WEIGHTS = ''
# LOSS
cfg.MODEL.DETR.DEFORMABLE = False
cfg.MODEL.DETR.USE_FOCAL_LOSS = False
cfg.MODEL.DETR.CENTERED_POSITION_ENCODIND = False
cfg.MODEL.DETR.CLS_WEIGHT = 1.0
cfg.MODEL.DETR.NUM_FEATURE_LEVELS = 4
cfg.MODEL.DETR.GIOU_WEIGHT = 2.0
cfg.MODEL.DETR.L1_WEIGHT = 5.0
cfg.MODEL.DETR.DEEP_SUPERVISION = True
cfg.MODEL.DETR.NO_OBJECT_WEIGHT = 0.1
# TRANSFORMER
cfg.MODEL.DETR.NHEADS = 8
cfg.MODEL.DETR.DROPOUT = 0.1
cfg.MODEL.DETR.DIM_FEEDFORWARD = 2048
cfg.MODEL.DETR.ENC_LAYERS = 6
cfg.MODEL.DETR.DEC_LAYERS = 6
cfg.MODEL.DETR.PRE_NORM = False
cfg.MODEL.DETR.HIDDEN_DIM = 256
cfg.MODEL.DETR.NUM_OBJECT_QUERIES = 100
cfg.SOLVER.OPTIMIZER = "ADAMW"
cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
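# --- Usage sketch (not part of the original file) -----------------------------
# `base_cfg` is assumed to be a D2Go default config that already contains the
# MODEL.FBNET_V2 node, since add_detr_config also overrides
# MODEL.FBNET_V2.OUT_FEATURES.
def _example_add_detr_config(base_cfg):
    add_detr_config(base_cfg)
    assert base_cfg.MODEL.DETR.NUM_OBJECT_QUERIES == 100
    return base_cfg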
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import logging
import numpy as np
import torch
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
__all__ = ["DetrDatasetMapper"]
def build_transform_gen(cfg, is_train):
"""
Create a list of :class:`TransformGen` from config.
Returns:
list[TransformGen]
"""
if is_train:
min_size = cfg.INPUT.MIN_SIZE_TRAIN
max_size = cfg.INPUT.MAX_SIZE_TRAIN
sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
else:
min_size = cfg.INPUT.MIN_SIZE_TEST
max_size = cfg.INPUT.MAX_SIZE_TEST
sample_style = "choice"
if sample_style == "range":
assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size))
logger = logging.getLogger(__name__)
tfm_gens = []
if is_train:
tfm_gens.append(T.RandomFlip())
tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style))
if is_train:
logger.info("TransformGens used in training: " + str(tfm_gens))
return tfm_gens
class DetrDatasetMapper:
"""
A callable which takes a dataset dict in Detectron2 Dataset format,
and maps it into a format used by DETR.
The callable currently does the following:
1. Reads the image from "file_name"
2. Applies geometric transforms to the image and annotations
3. Finds and applies suitable cropping to the image and annotations
4. Prepares the image and annotations as Tensors
"""
def __init__(self, cfg, is_train=True):
if cfg.INPUT.CROP.ENABLED and is_train:
self.crop_gen = [
T.ResizeShortestEdge([400, 500, 600], sample_style="choice"),
T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE),
]
else:
self.crop_gen = None
self.mask_on = cfg.MODEL.MASK_ON
self.tfm_gens = build_transform_gen(cfg, is_train)
logging.getLogger(__name__).info(
"Full TransformGens used in training: {}, crop: {}".format(str(self.tfm_gens), str(self.crop_gen))
)
self.img_format = cfg.INPUT.FORMAT
self.is_train = is_train
def __call__(self, dataset_dict):
"""
Args:
dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
Returns:
dict: a format that builtin models in detectron2 accept
"""
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
utils.check_image_size(dataset_dict, image)
if self.crop_gen is None:
image, transforms = T.apply_transform_gens(self.tfm_gens, image)
else:
if np.random.rand() > 0.5:
image, transforms = T.apply_transform_gens(self.tfm_gens, image)
else:
image, transforms = T.apply_transform_gens(
self.tfm_gens[:-1] + self.crop_gen + self.tfm_gens[-1:], image
)
image_shape = image.shape[:2] # h, w
# Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
# but not efficient on large generic data structures due to the use of pickle & mp.Queue.
# Therefore it's important to use torch.Tensor.
dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
if not self.is_train:
# USER: Modify this if you want to keep them for some reason.
dataset_dict.pop("annotations", None)
return dataset_dict
if "annotations" in dataset_dict:
# USER: Modify this if you want to keep them for some reason.
for anno in dataset_dict["annotations"]:
if not self.mask_on:
anno.pop("segmentation", None)
anno.pop("keypoints", None)
# USER: Implement additional transformations if you have other types of data
annos = [
utils.transform_instance_annotations(obj, transforms, image_shape)
for obj in dataset_dict.pop("annotations")
if obj.get("iscrowd", 0) == 0
]
instances = utils.annotations_to_instances(annos, image_shape)
dataset_dict["instances"] = utils.filter_empty_instances(instances)
return dataset_dict
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from detectron2.layers import ShapeSpec
from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, detector_postprocess
from detectron2.structures import Boxes, ImageList, Instances, BitMasks
from detr.models.backbone import Joiner
from detr.models.detr import DETR
from detr.models.deformable_detr import DeformableDETR
from detr.models.setcriterion import SetCriterion, FocalLossSetCriterion
from detr.models.matcher import HungarianMatcher
from detr.models.position_encoding import PositionEmbeddingSine
from detr.models.transformer import Transformer
from detr.models.deformable_transformer import DeformableTransformer
from detr.models.segmentation import DETRsegm, PostProcessSegm
from detr.util.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
from detr.util.misc import NestedTensor
from detr.datasets.coco import convert_coco_poly_to_mask
__all__ = ["Detr"]
class ResNetMaskedBackbone(nn.Module):
""" This is a thin wrapper around D2's backbone to provide padding masking"""
def __init__(self, cfg):
super().__init__()
self.backbone = build_backbone(cfg)
backbone_shape = self.backbone.output_shape()
if cfg.MODEL.DETR.NUM_FEATURE_LEVELS > 1:
self.strides = [8, 16, 32]
else:
self.strides = [32]
if cfg.MODEL.RESNETS.RES5_DILATION == 2:
# fix dilation from d2
self.backbone.stages[-1][0].conv2.dilation = (1, 1)
self.backbone.stages[-1][0].conv2.padding = (1, 1)
self.strides[-1] = self.strides[-1] // 2
self.feature_strides = [backbone_shape[f].stride for f in backbone_shape.keys()]
self.num_channels = [backbone_shape[k].channels for k in backbone_shape.keys()]
def forward(self, images):
features = self.backbone(images.tensor)
masks = self.mask_out_padding(
[features_per_level.shape for features_per_level in features.values()],
images.image_sizes,
images.tensor.device,
)
assert len(features) == len(masks)
for i, k in enumerate(features.keys()):
features[k] = NestedTensor(features[k], masks[i])
return features
def mask_out_padding(self, feature_shapes, image_sizes, device):
masks = []
assert len(feature_shapes) == len(self.feature_strides)
for idx, shape in enumerate(feature_shapes):
N, _, H, W = shape
masks_per_feature_level = torch.ones((N, H, W), dtype=torch.bool, device=device)
for img_idx, (h, w) in enumerate(image_sizes):
masks_per_feature_level[
img_idx,
: int(np.ceil(float(h) / self.feature_strides[idx])),
: int(np.ceil(float(w) / self.feature_strides[idx])),
] = 0
masks.append(masks_per_feature_level)
return masks
class FBNetMaskedBackbone(nn.Module):
""" This is a thin wrapper around D2's backbone to provide padding masking"""
def __init__(self, cfg):
super().__init__()
self.backbone = build_backbone(cfg)
self.out_features = cfg.MODEL.FBNET_V2.OUT_FEATURES
self.feature_strides = list(self.backbone._out_feature_strides.values())
self.num_channels = [self.backbone._out_feature_channels[k] for k in self.out_features]
self.strides = [self.backbone._out_feature_strides[k] for k in self.out_features]
def forward(self, images):
features = self.backbone(images.tensor)
masks = self.mask_out_padding(
[features_per_level.shape for features_per_level in features.values()],
images.image_sizes,
images.tensor.device,
)
assert len(features) == len(masks)
ret_features = {}
for i, k in enumerate(features.keys()):
if k in self.out_features:
ret_features[k] = NestedTensor(features[k], masks[i])
return ret_features
def mask_out_padding(self, feature_shapes, image_sizes, device):
masks = []
assert len(feature_shapes) == len(self.feature_strides)
for idx, shape in enumerate(feature_shapes):
N, _, H, W = shape
masks_per_feature_level = torch.ones((N, H, W), dtype=torch.bool, device=device)
for img_idx, (h, w) in enumerate(image_sizes):
masks_per_feature_level[
img_idx,
: int(np.ceil(float(h) / self.feature_strides[idx])),
: int(np.ceil(float(w) / self.feature_strides[idx])),
] = 0
masks.append(masks_per_feature_level)
return masks
@META_ARCH_REGISTRY.register()
class Detr(nn.Module):
"""
Implement Detr
"""
def __init__(self, cfg):
super().__init__()
self.device = torch.device(cfg.MODEL.DEVICE)
self.num_classes = cfg.MODEL.DETR.NUM_CLASSES
self.mask_on = cfg.MODEL.MASK_ON
hidden_dim = cfg.MODEL.DETR.HIDDEN_DIM
num_queries = cfg.MODEL.DETR.NUM_OBJECT_QUERIES
# Transformer parameters:
nheads = cfg.MODEL.DETR.NHEADS
dropout = cfg.MODEL.DETR.DROPOUT
dim_feedforward = cfg.MODEL.DETR.DIM_FEEDFORWARD
enc_layers = cfg.MODEL.DETR.ENC_LAYERS
dec_layers = cfg.MODEL.DETR.DEC_LAYERS
pre_norm = cfg.MODEL.DETR.PRE_NORM
# Loss parameters:
giou_weight = cfg.MODEL.DETR.GIOU_WEIGHT
l1_weight = cfg.MODEL.DETR.L1_WEIGHT
cls_weight = cfg.MODEL.DETR.CLS_WEIGHT
deep_supervision = cfg.MODEL.DETR.DEEP_SUPERVISION
no_object_weight = cfg.MODEL.DETR.NO_OBJECT_WEIGHT
centered_position_encoding = cfg.MODEL.DETR.CENTERED_POSITION_ENCODIND
num_feature_levels = cfg.MODEL.DETR.NUM_FEATURE_LEVELS
N_steps = hidden_dim // 2
if 'resnet' in cfg.MODEL.BACKBONE.NAME.lower():
d2_backbone = ResNetMaskedBackbone(cfg)
elif 'fbnet' in cfg.MODEL.BACKBONE.NAME.lower():
d2_backbone = FBNetMaskedBackbone(cfg)
else:
raise NotImplementedError
backbone = Joiner(d2_backbone, PositionEmbeddingSine(N_steps, normalize=True, centered=centered_position_encoding))
backbone.num_channels = d2_backbone.num_channels
self.use_focal_loss = cfg.MODEL.DETR.USE_FOCAL_LOSS
if cfg.MODEL.DETR.DEFORMABLE:
transformer = DeformableTransformer(
d_model=hidden_dim,
nhead=nheads,
num_encoder_layers=enc_layers,
num_decoder_layers=dec_layers,
dim_feedforward=dim_feedforward,
dropout=dropout,
activation="relu",
return_intermediate_dec=True,
num_feature_levels=num_feature_levels,
dec_n_points=4,
enc_n_points=4,
two_stage=False,
two_stage_num_proposals=num_queries,
)
self.detr = DeformableDETR(
backbone, transformer, num_classes=self.num_classes, num_queries=num_queries,
num_feature_levels=num_feature_levels, aux_loss=deep_supervision,
)
else:
transformer = Transformer(
d_model=hidden_dim,
dropout=dropout,
nhead=nheads,
dim_feedforward=dim_feedforward,
num_encoder_layers=enc_layers,
num_decoder_layers=dec_layers,
normalize_before=pre_norm,
return_intermediate_dec=deep_supervision,
)
self.detr = DETR(
backbone, transformer, num_classes=self.num_classes, num_queries=num_queries,
aux_loss=deep_supervision, use_focal_loss=self.use_focal_loss,
)
if self.mask_on:
frozen_weights = cfg.MODEL.DETR.FROZEN_WEIGHTS
if frozen_weights != '':
print("LOAD pre-trained weights")
weight = torch.load(frozen_weights, map_location=lambda storage, loc: storage)['model']
new_weight = {}
for k, v in weight.items():
if 'detr.' in k:
new_weight[k.replace('detr.', '')] = v
else:
print(f"Skipping loading weight {k} from frozen model")
del weight
self.detr.load_state_dict(new_weight)
del new_weight
self.detr = DETRsegm(self.detr, freeze_detr=(frozen_weights != ''))
self.seg_postprocess = PostProcessSegm
self.detr.to(self.device)
# building criterion
matcher = HungarianMatcher(cost_class=cls_weight, cost_bbox=l1_weight,
cost_giou=giou_weight, use_focal_loss=self.use_focal_loss)
weight_dict = {"loss_ce": cls_weight, "loss_bbox": l1_weight}
weight_dict["loss_giou"] = giou_weight
if deep_supervision:
aux_weight_dict = {}
for i in range(dec_layers - 1):
aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
weight_dict.update(aux_weight_dict)
losses = ["labels", "boxes", "cardinality"]
if self.mask_on:
losses += ["masks"]
if self.use_focal_loss:
self.criterion = FocalLossSetCriterion(
self.num_classes, matcher=matcher, weight_dict=weight_dict, losses=losses,
)
else:
self.criterion = SetCriterion(
self.num_classes, matcher=matcher, weight_dict=weight_dict, eos_coef=no_object_weight, losses=losses,
)
self.criterion.to(self.device)
pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
self.normalizer = lambda x: (x - pixel_mean) / pixel_std
self.to(self.device)
def forward(self, batched_inputs):
"""
Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
Each item in the list contains the inputs for one image.
For now, each item in the list is a dict that contains:
* image: Tensor, image in (C, H, W) format.
* instances: Instances
Other information that's included in the original dicts, such as:
* "height", "width" (int): the output resolution of the model, used in inference.
                See :func:`detectron2.modeling.postprocessing.detector_postprocess` for details.
        Returns:
            During training, a dict[str, Tensor] mapping each named loss to a tensor
            storing that loss. During inference, a list of dicts with one "instances"
            entry per image holding the post-processed predictions.
        """
images = self.preprocess_image(batched_inputs)
output = self.detr(images)
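        # `output` follows the DETR convention: "pred_logits" with shape
        # (batch, num_queries, num_classes[+1]), "pred_boxes" with normalized
        # (cx, cy, w, h) boxes, plus "aux_outputs" when deep supervision is on
        # and "pred_masks" when the mask head is enabled.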
if self.training:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
targets = self.prepare_targets(gt_instances)
loss_dict = self.criterion(output, targets)
weight_dict = self.criterion.weight_dict
for k in loss_dict.keys():
if k in weight_dict:
loss_dict[k] *= weight_dict[k]
return loss_dict
else:
box_cls = output["pred_logits"]
box_pred = output["pred_boxes"]
mask_pred = output["pred_masks"] if self.mask_on else None
results = self.inference(box_cls, box_pred, mask_pred, images.image_sizes)
processed_results = []
for results_per_image, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
r = detector_postprocess(results_per_image, height, width)
processed_results.append({"instances": r})
return processed_results
def prepare_targets(self, targets):
new_targets = []
for targets_per_image in targets:
h, w = targets_per_image.image_size
image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float, device=self.device)
gt_classes = targets_per_image.gt_classes
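            # DETR's matcher and box losses expect normalized (cx, cy, w, h) boxes,
            # so divide the absolute xyxy ground truth by (w, h, w, h) before converting.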
gt_boxes = targets_per_image.gt_boxes.tensor / image_size_xyxy
gt_boxes = box_xyxy_to_cxcywh(gt_boxes)
new_targets.append({"labels": gt_classes, "boxes": gt_boxes})
if self.mask_on and hasattr(targets_per_image, 'gt_masks'):
gt_masks = targets_per_image.gt_masks
gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
new_targets[-1].update({'masks': gt_masks})
return new_targets
def inference(self, box_cls, box_pred, mask_pred, image_sizes):
"""
Arguments:
box_cls (Tensor): tensor of shape (batch_size, num_queries, K).
The tensor predicts the classification probability for each query.
            box_pred (Tensor): tensor of shape (batch_size, num_queries, 4).
                The tensor predicts 4-vector (cx, cy, w, h) box
                regression values for every query.
            mask_pred (Tensor or None): tensor of shape (batch_size, num_queries, H, W)
                holding mask logits when the mask head is enabled, otherwise None.
            image_sizes (List[torch.Size]): the input image sizes
Returns:
results (List[Instances]): a list of #images elements.
"""
assert len(box_cls) == len(image_sizes)
results = []
        # For each box we assign the best class, or the second best if the best one is `no_object`.
if self.use_focal_loss:
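            # With sigmoid (focal-loss) classification there is no explicit
            # background class; rank all query/class pairs jointly, keep the top
            # 100, then split the flat index back into a query index and a label.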
prob = box_cls.sigmoid()
            # TODO: make top-100 selection an option for the non-focal-loss path as well
scores, topk_indexes = torch.topk(prob.view(box_cls.shape[0], -1), 100, dim=1)
topk_boxes = topk_indexes // box_cls.shape[2]
labels = topk_indexes % box_cls.shape[2]
else:
scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1)
for i, (scores_per_image, labels_per_image, box_pred_per_image, image_size) in enumerate(zip(
scores, labels, box_pred, image_sizes
)):
result = Instances(image_size)
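            # DETR predicts boxes as normalized (cx, cy, w, h); convert to xyxy here
            # and scale to absolute pixel coordinates a few lines below.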
boxes = box_cxcywh_to_xyxy(box_pred_per_image)
if self.use_focal_loss:
                # Use this image's own top-k indices so inference also works with batch size > 1.
                boxes = torch.gather(
                    boxes.unsqueeze(0), 1, topk_boxes[i : i + 1].unsqueeze(-1).repeat(1, 1, 4)
                ).squeeze(0)
result.pred_boxes = Boxes(boxes)
result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0])
if self.mask_on:
mask = F.interpolate(mask_pred[i].unsqueeze(0), size=image_size, mode='bilinear', align_corners=False)
mask = mask[0].sigmoid() > 0.5
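                # detector_postprocess expects per-instance ROI masks, so the binarized
                # full-image masks are re-sampled into fixed 32x32 crops aligned with
                # each predicted box.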
B, N, H, W = mask_pred.shape
mask = BitMasks(mask.cpu()).crop_and_resize(result.pred_boxes.tensor.cpu(), 32)
result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device)
result.scores = scores_per_image
result.pred_classes = labels_per_image
results.append(result)
return results
def preprocess_image(self, batched_inputs):
"""
Normalize, pad and batch the input images.
"""
images = [self.normalizer(x["image"].to(self.device)) for x in batched_inputs]
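        # ImageList.from_tensors pads the batch to a common (H, W); the masked
        # backbone later marks those padded pixels so the transformer ignores them.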
images = ImageList.from_tensors(images)
return images
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch.utils.data
import torchvision
from .coco import build as build_coco
def get_coco_api_from_dataset(dataset):
    # Unwrap up to 10 levels of torch.utils.data.Subset nesting to reach the
    # underlying torchvision CocoDetection dataset and return its COCO API object.
    for _ in range(10):
if isinstance(dataset, torch.utils.data.Subset):
dataset = dataset.dataset
if isinstance(dataset, torchvision.datasets.CocoDetection):
return dataset.coco
def build_dataset(image_set, args):
if args.dataset_file == 'coco':
return build_coco(image_set, args)
if args.dataset_file == 'coco_panoptic':
# to avoid making panopticapi required for coco
from .coco_panoptic import build as build_coco_panoptic
return build_coco_panoptic(image_set, args)
raise ValueError(f'dataset {args.dataset_file} not supported')
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
COCO dataset which returns image_id for evaluation.
Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
"""
from pathlib import Path
import torch
import torch.utils.data
import torchvision
from pycocotools import mask as coco_mask
import detr.datasets.transforms as T
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, ann_file, transforms, return_masks):
super(CocoDetection, self).__init__(img_folder, ann_file)
self._transforms = transforms
self.prepare = ConvertCocoPolysToMask(return_masks)
def __getitem__(self, idx):
img, target = super(CocoDetection, self).__getitem__(idx)
image_id = self.ids[idx]
target = {'image_id': image_id, 'annotations': target}
img, target = self.prepare(img, target)
if self._transforms is not None:
img, target = self._transforms(img, target)
return img, target
def convert_coco_poly_to_mask(segmentations, height, width):
masks = []
for polygons in segmentations:
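        # frPyObjects converts one instance's polygon list into COCO RLEs and
        # decode yields an (H, W, num_polygons) binary array; any() over the last
        # dimension merges the polygons into a single instance mask.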
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = torch.as_tensor(mask, dtype=torch.uint8)
mask = mask.any(dim=2)
masks.append(mask)
if masks:
masks = torch.stack(masks, dim=0)
else:
masks = torch.zeros((0, height, width), dtype=torch.uint8)
return masks
class ConvertCocoPolysToMask(object):
def __init__(self, return_masks=False):
self.return_masks = return_masks
def __call__(self, image, target):
w, h = image.size
image_id = target["image_id"]
image_id = torch.tensor([image_id])
anno = target["annotations"]
anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
boxes = [obj["bbox"] for obj in anno]
# guard against no boxes via resizing
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
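        # COCO stores boxes as (x, y, w, h); convert them to (x0, y0, x1, y1) and
        # clamp to the image bounds before degenerate boxes are filtered out below.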
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2].clamp_(min=0, max=w)
boxes[:, 1::2].clamp_(min=0, max=h)
classes = [obj["category_id"] for obj in anno]
classes = torch.tensor(classes, dtype=torch.int64)
if self.return_masks:
segmentations = [obj["segmentation"] for obj in anno]
masks = convert_coco_poly_to_mask(segmentations, h, w)
keypoints = None
if anno and "keypoints" in anno[0]:
keypoints = [obj["keypoints"] for obj in anno]
keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
num_keypoints = keypoints.shape[0]
if num_keypoints:
keypoints = keypoints.view(num_keypoints, -1, 3)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
classes = classes[keep]
if self.return_masks:
masks = masks[keep]
if keypoints is not None:
keypoints = keypoints[keep]
target = {}
target["boxes"] = boxes
target["labels"] = classes
if self.return_masks:
target["masks"] = masks
target["image_id"] = image_id
if keypoints is not None:
target["keypoints"] = keypoints
# for conversion to coco api
area = torch.tensor([obj["area"] for obj in anno])
iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
target["area"] = area[keep]
target["iscrowd"] = iscrowd[keep]
target["orig_size"] = torch.as_tensor([int(h), int(w)])
target["size"] = torch.as_tensor([int(h), int(w)])
return image, target
def make_coco_transforms(image_set):
normalize = T.Compose([
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
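    # DETR-style multi-scale training: the shorter side is randomly resized to one
    # of these scales (longer side capped at 1333), optionally preceded by a
    # random resize + crop, as assembled below.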
if image_set == 'train':
return T.Compose([
T.RandomHorizontalFlip(),
T.RandomSelect(
T.RandomResize(scales, max_size=1333),
T.Compose([
T.RandomResize([400, 500, 600]),
T.RandomSizeCrop(384, 600),
T.RandomResize(scales, max_size=1333),
])
),
normalize,
])
if image_set == 'val':
return T.Compose([
T.RandomResize([800], max_size=1333),
normalize,
])
    raise ValueError(f'unknown image_set {image_set!r}')
def build(image_set, args):
root = Path(args.coco_path)
assert root.exists(), f'provided COCO path {root} does not exist'
mode = 'instances'
PATHS = {
"train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'),
"val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'),
}
img_folder, ann_file = PATHS[image_set]
dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks)
return dataset
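# Example usage (hypothetical argparse namespace, for illustration only):
#   from argparse import Namespace
#   args = Namespace(coco_path="/path/to/coco", masks=False)
#   train_set = build("train", args)  # CocoDetection with DETR train-time augmentation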