# Copyright (c) Facebook, Inc. and its affiliates.
import os
import torch
from detectron2.utils.file_io import PathManager
from .torchscript_patch import freeze_training_mode, patch_instances
__all__ = ["scripting_with_instances", "dump_torchscript_IR"]
def scripting_with_instances(model, fields):
"""
Run :func:`torch.jit.script` on a model that uses the :class:`Instances` class. Since
attributes of :class:`Instances` are "dynamically" added in eager mode, it is difficult
for scripting to support it out of the box. This function is made to support scripting
a model that uses :class:`Instances`. It does the following:
1. Create a scriptable ``new_Instances`` class which behaves similarly to ``Instances``,
but with all attributes being "static".
The attributes need to be statically declared in the ``fields`` argument.
2. Register ``new_Instances``, and force scripting compiler to
use it when trying to compile ``Instances``.
After this function returns, the patching is reverted. The user should be able to script another model
using different fields.
Example:
Assume that ``Instances`` in the model consist of two attributes named
``proposal_boxes`` and ``objectness_logits`` with type :class:`Boxes` and
:class:`Tensor` respectively during inference. You can call this function like:
::
fields = {"proposal_boxes": Boxes, "objectness_logits": torch.Tensor}
torchscript_model = scripting_with_instances(model, fields)
Note:
It only supports models in evaluation mode.
Args:
model (nn.Module): The input model to be exported by scripting.
fields (Dict[str, type]): Attribute names and corresponding type that
``Instances`` will use in the model. Note that all attributes used in ``Instances``
need to be added, regardless of whether they are inputs/outputs of the model.
Data types not defined in detectron2 are not supported for now.
Returns:
torch.jit.ScriptModule: the model in torchscript format
"""
assert (
not model.training
), "Currently we only support exporting models in evaluation mode to torchscript"
with freeze_training_mode(model), patch_instances(fields):
scripted_model = torch.jit.script(model)
return scripted_model
# alias for old name
export_torchscript_with_instances = scripting_with_instances
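# Illustrative usage sketch (editor's addition, not part of the original file). It assumes a
# detectron2-style model whose Instances expose `proposal_boxes` (Boxes) and
# `objectness_logits` (Tensor) at inference time; adapt `fields` to your own model.
def _example_export_with_instances(model):
    from detectron2.structures import Boxes  # local import keeps the sketch self-contained

    model.eval()  # scripting_with_instances only supports evaluation mode
    fields = {"proposal_boxes": Boxes, "objectness_logits": torch.Tensor}
    ts_model = scripting_with_instances(model, fields)
    ts_model.save("model.ts")  # standard torch.jit.ScriptModule serialization
    return ts_model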
def dump_torchscript_IR(model, dir):
"""
Dump IR of a TracedModule/ScriptModule/ScriptFunction in various formats (code, graph,
inlined graph). Useful for debugging.
Args:
model (TracedModule/ScriptModule/ScriptFunction): traced or scripted module
dir (str): output directory to dump files.
"""
dir = os.path.expanduser(dir)
PathManager.mkdirs(dir)
def _get_script_mod(mod):
if isinstance(mod, torch.jit.TracedModule):
return mod._actual_script_module
return mod
# Dump pretty-printed code: https://pytorch.org/docs/stable/jit.html#inspecting-code
with PathManager.open(os.path.join(dir, "model_ts_code.txt"), "w") as f:
def get_code(mod):
# Try a few ways to get code using private attributes.
try:
# This contains more information than just `mod.code`
return _get_script_mod(mod)._c.code
except AttributeError:
pass
try:
return mod.code
except AttributeError:
return None
def dump_code(prefix, mod):
code = get_code(mod)
name = prefix or "root model"
if code is None:
f.write(f"Could not found code for {name} (type={mod.original_name})\n")
f.write("\n")
else:
f.write(f"\nCode for {name}, type={mod.original_name}:\n")
f.write(code)
f.write("\n")
f.write("-" * 80)
for name, m in mod.named_children():
dump_code(prefix + "." + name, m)
if isinstance(model, torch.jit.ScriptFunction):
f.write(get_code(model))
else:
dump_code("", model)
def _get_graph(model):
try:
# Recursively dump IR of all modules
return _get_script_mod(model)._c.dump_to_str(True, False, False)
except AttributeError:
return model.graph.str()
with PathManager.open(os.path.join(dir, "model_ts_IR.txt"), "w") as f:
f.write(_get_graph(model))
# Dump IR of the entire graph (all submodules inlined)
with PathManager.open(os.path.join(dir, "model_ts_IR_inlined.txt"), "w") as f:
f.write(str(model.inlined_graph))
if not isinstance(model, torch.jit.ScriptFunction):
# Dump the model structure in pytorch style
with PathManager.open(os.path.join(dir, "model.txt"), "w") as f:
f.write(str(model))
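# Illustrative usage sketch (editor's addition): script a model and dump its IR for
# inspection. `model` and the output directory are placeholders.
def _example_dump_ir(model):
    model.eval()
    ts_model = torch.jit.script(model)  # or torch.jit.trace(model, example_inputs)
    # writes model_ts_code.txt, model_ts_IR.txt, model_ts_IR_inlined.txt and,
    # for modules, model.txt into the directory
    dump_torchscript_IR(ts_model, "./ts_dump")
    return ts_model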
# Copyright (c) Facebook, Inc. and its affiliates.
import os
import sys
import tempfile
from contextlib import ExitStack, contextmanager
from copy import deepcopy
from unittest import mock
import torch
from torch import nn
# need some explicit imports due to https://github.com/pytorch/pytorch/issues/38964
import detectron2 # noqa F401
from detectron2.structures import Boxes, Instances
from detectron2.utils.env import _import_file
_counter = 0
def _clear_jit_cache():
from torch.jit._recursive import concrete_type_store
from torch.jit._state import _jit_caching_layer
concrete_type_store.type_store.clear() # for modules
_jit_caching_layer.clear() # for free functions
def _add_instances_conversion_methods(newInstances):
"""
Add from_instances methods to the scripted Instances class.
"""
cls_name = newInstances.__name__
@torch.jit.unused
def from_instances(instances: Instances):
"""
Create scripted Instances from original Instances
"""
fields = instances.get_fields()
image_size = instances.image_size
ret = newInstances(image_size)
for name, val in fields.items():
assert hasattr(ret, f"_{name}"), f"No attribute named {name} in {cls_name}"
setattr(ret, name, deepcopy(val))
return ret
newInstances.from_instances = from_instances
@contextmanager
def patch_instances(fields):
"""
A context manager under which the Instances class in detectron2 is replaced
by a statically-typed scriptable class, defined by `fields`.
See more in `scripting_with_instances`.
"""
with tempfile.TemporaryDirectory(prefix="detectron2") as dir, tempfile.NamedTemporaryFile(
mode="w", encoding="utf-8", suffix=".py", dir=dir, delete=False
) as f:
try:
# Objects that use Instances should not reuse previously-compiled
# results in cache, because `Instances` could be a new class each time.
_clear_jit_cache()
cls_name, s = _gen_instance_module(fields)
f.write(s)
f.flush()
f.close()
module = _import(f.name)
new_instances = getattr(module, cls_name)
_ = torch.jit.script(new_instances)
# let torchscript think Instances was scripted already
Instances.__torch_script_class__ = True
# let torchscript find new_instances when looking for the jit type of Instances
Instances._jit_override_qualname = torch._jit_internal._qualified_name(new_instances)
_add_instances_conversion_methods(new_instances)
yield new_instances
finally:
try:
del Instances.__torch_script_class__
del Instances._jit_override_qualname
except AttributeError:
pass
sys.modules.pop(module.__name__)
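# Illustrative usage sketch (editor's addition): `patch_instances` is normally used through
# `scripting_with_instances`, but it can also be applied directly. The field names below
# are assumptions for a box-predicting module, not part of this file.
def _example_patch_instances(module_using_instances):
    from detectron2.structures import Boxes

    fields = {"pred_boxes": Boxes, "scores": torch.Tensor}
    with patch_instances(fields) as new_instances:
        scripted = torch.jit.script(module_using_instances)
        # `new_instances.from_instances(inst)` converts an eager Instances object
        # into the statically-typed scripted class if needed.
    return scripted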
def _gen_instance_class(fields):
"""
Args:
fields (dict[name: type])
"""
class _FieldType:
def __init__(self, name, type_):
assert isinstance(name, str), f"Field name must be str, got {name}"
self.name = name
self.type_ = type_
self.annotation = f"{type_.__module__}.{type_.__name__}"
fields = [_FieldType(k, v) for k, v in fields.items()]
def indent(level, s):
return " " * 4 * level + s
lines = []
global _counter
_counter += 1
cls_name = "ScriptedInstances{}".format(_counter)
field_names = tuple(x.name for x in fields)
extra_args = ", ".join([f"{f.name}: Optional[{f.annotation}] = None" for f in fields])
lines.append(
f"""
class {cls_name}:
def __init__(self, image_size: Tuple[int, int], {extra_args}):
self.image_size = image_size
self._field_names = {field_names}
"""
)
for f in fields:
lines.append(
indent(2, f"self._{f.name} = torch.jit.annotate(Optional[{f.annotation}], {f.name})")
)
for f in fields:
lines.append(
f"""
@property
def {f.name}(self) -> {f.annotation}:
# has to use a local for type refinement
# https://pytorch.org/docs/stable/jit_language_reference.html#optional-type-refinement
t = self._{f.name}
assert t is not None, "{f.name} is None and cannot be accessed!"
return t
@{f.name}.setter
def {f.name}(self, value: {f.annotation}) -> None:
self._{f.name} = value
"""
)
# support method `__len__`
lines.append(
"""
def __len__(self) -> int:
"""
)
for f in fields:
lines.append(
f"""
t = self._{f.name}
if t is not None:
return len(t)
"""
)
lines.append(
"""
raise NotImplementedError("Empty Instances does not support __len__!")
"""
)
# support method `has`
lines.append(
"""
def has(self, name: str) -> bool:
"""
)
for f in fields:
lines.append(
f"""
if name == "{f.name}":
return self._{f.name} is not None
"""
)
lines.append(
"""
return False
"""
)
# support method `to`
none_args = ", None" * len(fields)
lines.append(
f"""
def to(self, device: torch.device) -> "{cls_name}":
ret = {cls_name}(self.image_size{none_args})
"""
)
for f in fields:
if hasattr(f.type_, "to"):
lines.append(
f"""
t = self._{f.name}
if t is not None:
ret._{f.name} = t.to(device)
"""
)
else:
# For now, ignore fields that cannot be moved to devices.
# Maybe can support other tensor-like classes (e.g. __torch_function__)
pass
lines.append(
"""
return ret
"""
)
# support method `getitem`
none_args = ", None" * len(fields)
lines.append(
f"""
def __getitem__(self, item) -> "{cls_name}":
ret = {cls_name}(self.image_size{none_args})
"""
)
for f in fields:
lines.append(
f"""
t = self._{f.name}
if t is not None:
ret._{f.name} = t[item]
"""
)
lines.append(
"""
return ret
"""
)
# support method `cat`
# this version does not contain checks that all instances have same size and fields
none_args = ", None" * len(fields)
lines.append(
f"""
def cat(self, instances: List["{cls_name}"]) -> "{cls_name}":
ret = {cls_name}(self.image_size{none_args})
"""
)
for f in fields:
lines.append(
f"""
t = self._{f.name}
if t is not None:
values: List[{f.annotation}] = [x.{f.name} for x in instances]
if torch.jit.isinstance(t, torch.Tensor):
ret._{f.name} = torch.cat(values, dim=0)
else:
ret._{f.name} = t.cat(values)
"""
)
lines.append(
"""
return ret"""
)
# support method `get_fields()`
lines.append(
"""
def get_fields(self) -> Dict[str, Tensor]:
ret = {}
"""
)
for f in fields:
if f.type_ == Boxes:
stmt = "t.tensor"
elif f.type_ == torch.Tensor:
stmt = "t"
else:
stmt = f'assert False, "unsupported type {str(f.type_)}"'
lines.append(
f"""
t = self._{f.name}
if t is not None:
ret["{f.name}"] = {stmt}
"""
)
lines.append(
"""
return ret"""
)
return cls_name, os.linesep.join(lines)
def _gen_instance_module(fields):
# TODO: find a more automatic way to enable import of other classes
s = """
from copy import deepcopy
import torch
from torch import Tensor
import typing
from typing import *
import detectron2
from detectron2.structures import Boxes, Instances
"""
cls_name, cls_def = _gen_instance_class(fields)
s += cls_def
return cls_name, s
def _import(path):
return _import_file(
"{}{}".format(sys.modules[__name__].__name__, _counter), path, make_importable=True
)
@contextmanager
def patch_builtin_len(modules=()):
"""
Patch the builtin len() function of a few detectron2 modules
to use __len__ instead, because __len__ does not convert values to
integers and therefore is friendly to tracing.
Args:
modules (list[str]): names of extra modules to patch len(), in
addition to those in detectron2.
"""
def _new_len(obj):
return obj.__len__()
with ExitStack() as stack:
MODULES = [
"detectron2.modeling.roi_heads.fast_rcnn",
"detectron2.modeling.roi_heads.mask_head",
"detectron2.modeling.roi_heads.keypoint_head",
] + list(modules)
ctxs = [stack.enter_context(mock.patch(mod + ".len")) for mod in MODULES]
for m in ctxs:
m.side_effect = _new_len
yield
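# Illustrative usage sketch (editor's addition): wrap tracing with `patch_builtin_len` so
# that `len(...)` calls inside the patched modules go through `__len__` and stay traceable.
# `model` and `example_inputs` are placeholders.
def _example_trace_with_patched_len(model, example_inputs):
    with patch_builtin_len():
        traced = torch.jit.trace(model, example_inputs)
    return traced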
def patch_nonscriptable_classes():
"""
Apply patches on a few nonscriptable detectron2 classes.
Should not have side-effects on eager usage.
"""
# __prepare_scriptable__ can also be added to models for easier maintenance.
# But it complicates the clean model code.
from detectron2.modeling.backbone import ResNet, FPN
# Due to https://github.com/pytorch/pytorch/issues/36061,
# we change backbone to use ModuleList for scripting.
# (note: this changes param names in state_dict)
def prepare_resnet(self):
ret = deepcopy(self)
ret.stages = nn.ModuleList(ret.stages)
for k in self.stage_names:
delattr(ret, k)
return ret
ResNet.__prepare_scriptable__ = prepare_resnet
def prepare_fpn(self):
ret = deepcopy(self)
ret.lateral_convs = nn.ModuleList(ret.lateral_convs)
ret.output_convs = nn.ModuleList(ret.output_convs)
for name, _ in self.named_children():
if name.startswith("fpn_"):
delattr(ret, name)
return ret
FPN.__prepare_scriptable__ = prepare_fpn
# Annotate some attributes to be constants for the purpose of scripting,
# even though they are not constants in eager mode.
from detectron2.modeling.roi_heads import StandardROIHeads
if hasattr(StandardROIHeads, "__annotations__"):
# copy first to avoid editing annotations of base class
StandardROIHeads.__annotations__ = deepcopy(StandardROIHeads.__annotations__)
StandardROIHeads.__annotations__["mask_on"] = torch.jit.Final[bool]
StandardROIHeads.__annotations__["keypoint_on"] = torch.jit.Final[bool]
# These patches are not supposed to have side-effects.
patch_nonscriptable_classes()
@contextmanager
def freeze_training_mode(model):
"""
A context manager that annotates the "training" attribute of every submodule
as constant, so that the training codepath in these modules can be
meta-compiled away. Upon exiting, the annotations are reverted.
"""
classes = {type(x) for x in model.modules()}
# __constants__ is the old way to annotate constants and is not compatible
# with __annotations__.
classes = {x for x in classes if not hasattr(x, "__constants__")}
for cls in classes:
cls.__annotations__["training"] = torch.jit.Final[bool]
yield
for cls in classes:
cls.__annotations__["training"] = bool
# Copyright (c) Facebook, Inc. and its affiliates.
from .batch_norm import FrozenBatchNorm2d, get_norm, NaiveSyncBatchNorm, CycleBatchNormList
from .deform_conv import DeformConv, ModulatedDeformConv
from .mask_ops import paste_masks_in_image
from .nms import batched_nms, batched_nms_rotated, nms, nms_rotated
from .roi_align import ROIAlign, roi_align
from .roi_align_rotated import ROIAlignRotated, roi_align_rotated
from .shape_spec import ShapeSpec
from .wrappers import (
BatchNorm2d,
Conv2d,
ConvTranspose2d,
cat,
interpolate,
Linear,
nonzero_tuple,
cross_entropy,
empty_input_loss_func_wrapper,
shapes_to_tensor,
move_device_like,
)
from .blocks import CNNBlockBase, DepthwiseSeparableConv2d
from .aspp import ASPP
from .losses import ciou_loss, diou_loss
__all__ = [k for k in globals().keys() if not k.startswith("_")]
# Copyright (c) Facebook, Inc. and its affiliates.
from copy import deepcopy
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from .batch_norm import get_norm
from .blocks import DepthwiseSeparableConv2d
from .wrappers import Conv2d
class ASPP(nn.Module):
"""
Atrous Spatial Pyramid Pooling (ASPP).
"""
def __init__(
self,
in_channels,
out_channels,
dilations,
*,
norm,
activation,
pool_kernel_size=None,
dropout: float = 0.0,
use_depthwise_separable_conv=False,
):
"""
Args:
in_channels (int): number of input channels for ASPP.
out_channels (int): number of output channels.
dilations (list): a list of 3 dilations in ASPP.
norm (str or callable): normalization for all conv layers.
See :func:`layers.get_norm` for supported format. norm is
applied to all conv layers except the conv following
global average pooling.
activation (callable): activation function.
pool_kernel_size (tuple, list): the average pooling size (kh, kw)
for image pooling layer in ASPP. If set to None, it always
performs global average pooling. If not None, it must be
divisible by the shape of inputs in forward(). It is recommended
to use a fixed input feature size in training, and set this
option to match this size, so that it performs global average
pooling in training, and the size of the pooling window stays
consistent in inference.
dropout (float): apply dropout on the output of ASPP. It is used in
the official DeepLab implementation with a rate of 0.1:
https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/model.py#L532 # noqa
use_depthwise_separable_conv (bool): use DepthwiseSeparableConv2d
for 3x3 convs in ASPP, proposed in :paper:`DeepLabV3+`.
"""
super(ASPP, self).__init__()
assert len(dilations) == 3, "ASPP expects 3 dilations, got {}".format(len(dilations))
self.pool_kernel_size = pool_kernel_size
self.dropout = dropout
use_bias = norm == ""
self.convs = nn.ModuleList()
# conv 1x1
self.convs.append(
Conv2d(
in_channels,
out_channels,
kernel_size=1,
bias=use_bias,
norm=get_norm(norm, out_channels),
activation=deepcopy(activation),
)
)
weight_init.c2_xavier_fill(self.convs[-1])
# atrous convs
for dilation in dilations:
if use_depthwise_separable_conv:
self.convs.append(
DepthwiseSeparableConv2d(
in_channels,
out_channels,
kernel_size=3,
padding=dilation,
dilation=dilation,
norm1=norm,
activation1=deepcopy(activation),
norm2=norm,
activation2=deepcopy(activation),
)
)
else:
self.convs.append(
Conv2d(
in_channels,
out_channels,
kernel_size=3,
padding=dilation,
dilation=dilation,
bias=use_bias,
norm=get_norm(norm, out_channels),
activation=deepcopy(activation),
)
)
weight_init.c2_xavier_fill(self.convs[-1])
# image pooling
# We do not add BatchNorm because the spatial resolution is 1x1,
# even though the original TF implementation has BatchNorm.
if pool_kernel_size is None:
image_pooling = nn.Sequential(
nn.AdaptiveAvgPool2d(1),
Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)),
)
else:
image_pooling = nn.Sequential(
nn.AvgPool2d(kernel_size=pool_kernel_size, stride=1),
Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)),
)
weight_init.c2_xavier_fill(image_pooling[1])
self.convs.append(image_pooling)
self.project = Conv2d(
5 * out_channels,
out_channels,
kernel_size=1,
bias=use_bias,
norm=get_norm(norm, out_channels),
activation=deepcopy(activation),
)
weight_init.c2_xavier_fill(self.project)
def forward(self, x):
size = x.shape[-2:]
if self.pool_kernel_size is not None:
if size[0] % self.pool_kernel_size[0] or size[1] % self.pool_kernel_size[1]:
raise ValueError(
"`pool_kernel_size` must be divisible by the shape of inputs. "
"Input size: {} `pool_kernel_size`: {}".format(size, self.pool_kernel_size)
)
res = []
for conv in self.convs:
res.append(conv(x))
res[-1] = F.interpolate(res[-1], size=size, mode="bilinear", align_corners=False)
res = torch.cat(res, dim=1)
res = self.project(res)
res = F.dropout(res, self.dropout, training=self.training) if self.dropout > 0 else res
return res
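# Illustrative usage sketch (editor's addition): a minimal ASPP head on a 32-channel
# feature map. The channel sizes and dilations are arbitrary example values.
def _example_aspp():
    aspp = ASPP(
        in_channels=32,
        out_channels=16,
        dilations=[6, 12, 18],
        norm="BN",
        activation=nn.ReLU(),
    )
    x = torch.randn(2, 32, 64, 64)
    out = aspp(x)  # shape: (2, 16, 64, 64)
    return out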
# Copyright (c) Facebook, Inc. and its affiliates.
import torch
import torch.distributed as dist
from fvcore.nn.distributed import differentiable_all_reduce
from torch import nn
from torch.nn import functional as F
from detectron2.utils import comm, env
from .wrappers import BatchNorm2d
class FrozenBatchNorm2d(nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed.
It contains non-trainable buffers called
"weight" and "bias", "running_mean", "running_var",
initialized to perform identity transformation.
The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
which are computed from the original four parameters of BN.
The affine transform `x * weight + bias` will perform the equivalent
computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
When loading a backbone model from Caffe2, "running_mean" and "running_var"
will be left unchanged as identity transformation.
Other pre-trained backbone models may contain all 4 parameters.
The forward is implemented by `F.batch_norm(..., training=False)`.
"""
_version = 3
def __init__(self, num_features, eps=1e-5):
super().__init__()
self.num_features = num_features
self.eps = eps
self.register_buffer("weight", torch.ones(num_features))
self.register_buffer("bias", torch.zeros(num_features))
self.register_buffer("running_mean", torch.zeros(num_features))
self.register_buffer("running_var", torch.ones(num_features) - eps)
self.register_buffer("num_batches_tracked", None)
def forward(self, x):
if x.requires_grad:
# When gradients are needed, F.batch_norm will use extra memory
# because its backward op computes gradients for weight/bias as well.
scale = self.weight * (self.running_var + self.eps).rsqrt()
bias = self.bias - self.running_mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
out_dtype = x.dtype # may be half
return x * scale.to(out_dtype) + bias.to(out_dtype)
else:
# When gradients are not needed, F.batch_norm is a single fused op
# and provides more optimization opportunities.
return F.batch_norm(
x,
self.running_mean,
self.running_var,
self.weight,
self.bias,
training=False,
eps=self.eps,
)
def _load_from_state_dict(
self,
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
version = local_metadata.get("version", None)
if version is None or version < 2:
# No running_mean/var in early versions
# This silences the warnings
if prefix + "running_mean" not in state_dict:
state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
if prefix + "running_var" not in state_dict:
state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)
super()._load_from_state_dict(
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
)
def __repr__(self):
return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)
@classmethod
def convert_frozen_batchnorm(cls, module):
"""
Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
Args:
module (torch.nn.Module):
Returns:
If module is BatchNorm/SyncBatchNorm, returns a new module.
Otherwise, convert the module in place and return it.
Similar to convert_sync_batchnorm in
https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
"""
bn_module = nn.modules.batchnorm
bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
res = module
if isinstance(module, bn_module):
res = cls(module.num_features)
if module.affine:
res.weight.data = module.weight.data.clone().detach()
res.bias.data = module.bias.data.clone().detach()
res.running_mean.data = module.running_mean.data
res.running_var.data = module.running_var.data
res.eps = module.eps
res.num_batches_tracked = module.num_batches_tracked
else:
for name, child in module.named_children():
new_child = cls.convert_frozen_batchnorm(child)
if new_child is not child:
res.add_module(name, new_child)
return res
@classmethod
def convert_frozenbatchnorm2d_to_batchnorm2d(cls, module: nn.Module) -> nn.Module:
"""
Convert all FrozenBatchNorm2d to BatchNorm2d
Args:
module (torch.nn.Module):
Returns:
If module is FrozenBatchNorm2d, returns a new module.
Otherwise, convert the module in place and return it.
This is needed for quantization:
https://fb.workplace.com/groups/1043663463248667/permalink/1296330057982005/
"""
res = module
if isinstance(module, FrozenBatchNorm2d):
res = torch.nn.BatchNorm2d(module.num_features, module.eps)
res.weight.data = module.weight.data.clone().detach()
res.bias.data = module.bias.data.clone().detach()
res.running_mean.data = module.running_mean.data.clone().detach()
res.running_var.data = module.running_var.data.clone().detach()
res.eps = module.eps
res.num_batches_tracked = module.num_batches_tracked
else:
for name, child in module.named_children():
new_child = cls.convert_frozenbatchnorm2d_to_batchnorm2d(child)
if new_child is not child:
res.add_module(name, new_child)
return res
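# Illustrative usage sketch (editor's addition): freeze every BatchNorm/SyncBatchNorm in a
# backbone, and convert back (e.g. before quantization). `backbone` is a placeholder module.
def _example_freeze_bn(backbone: nn.Module) -> nn.Module:
    backbone = FrozenBatchNorm2d.convert_frozen_batchnorm(backbone)
    # ... train / run inference; BN statistics and affine parameters stay fixed ...
    backbone = FrozenBatchNorm2d.convert_frozenbatchnorm2d_to_batchnorm2d(backbone)
    return backbone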
def get_norm(norm, out_channels):
"""
Args:
norm (str or callable): either one of BN, SyncBN, FrozenBN, GN;
or a callable that takes a channel number and returns
the normalization layer as a nn.Module.
Returns:
nn.Module or None: the normalization layer
"""
if norm is None:
return None
if isinstance(norm, str):
if len(norm) == 0:
return None
norm = {
"BN": BatchNorm2d,
# Fixed in https://github.com/pytorch/pytorch/pull/36382
"SyncBN": NaiveSyncBatchNorm if env.TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm,
"FrozenBN": FrozenBatchNorm2d,
"GN": lambda channels: nn.GroupNorm(32, channels),
# for debugging:
"nnSyncBN": nn.SyncBatchNorm,
"naiveSyncBN": NaiveSyncBatchNorm,
# expose stats_mode N as an option to caller, required for zero-len inputs
"naiveSyncBN_N": lambda channels: NaiveSyncBatchNorm(channels, stats_mode="N"),
"LN": lambda channels: LayerNorm(channels),
}[norm]
return norm(out_channels)
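# Illustrative usage sketch (editor's addition): build a conv + norm stack from a
# configurable norm string, the way the rest of detectron2's layers do.
def _example_get_norm(norm: str = "GN", channels: int = 64) -> nn.Module:
    norm_layer = get_norm(norm, channels)  # returns None for norm="" or norm=None
    modules = [nn.Conv2d(3, channels, kernel_size=3, padding=1, bias=norm_layer is None)]
    if norm_layer is not None:
        modules.append(norm_layer)
    return nn.Sequential(*modules)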
class NaiveSyncBatchNorm(BatchNorm2d):
"""
In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient
when the batch size on each worker is different.
(e.g., when scale augmentation is used, or when it is applied to mask head).
This is a slower but correct alternative to `nn.SyncBatchNorm`.
Note:
There isn't a single definition of Sync BatchNorm.
When ``stats_mode==""``, this module computes overall statistics by using
statistics of each worker with equal weight. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (N, H, W). This mode does not support inputs with zero batch size.
When ``stats_mode=="N"``, this module computes overall statistics by weighting
the statistics of each worker by their ``N``. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (H, W). It is slower than ``stats_mode==""``.
Even though the result of this module may not be the true statistics of all samples,
it may still be reasonable because it might be preferable to assign equal weights
to all workers, regardless of their (H, W) dimension, instead of putting larger weight
on larger images. From preliminary experiments, little difference is found between such
a simplified implementation and an accurate computation of overall mean & variance.
"""
def __init__(self, *args, stats_mode="", **kwargs):
super().__init__(*args, **kwargs)
assert stats_mode in ["", "N"]
self._stats_mode = stats_mode
def forward(self, input):
if comm.get_world_size() == 1 or not self.training:
return super().forward(input)
B, C = input.shape[0], input.shape[1]
half_input = input.dtype == torch.float16
if half_input:
# fp16 does not have good enough numerics for the reduction here
input = input.float()
mean = torch.mean(input, dim=[0, 2, 3])
meansqr = torch.mean(input * input, dim=[0, 2, 3])
if self._stats_mode == "":
assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
vec = torch.cat([mean, meansqr], dim=0)
vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size())
mean, meansqr = torch.split(vec, C)
momentum = self.momentum
else:
if B == 0:
vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
vec = vec + input.sum() # make sure there is gradient w.r.t input
else:
vec = torch.cat(
[
mean,
meansqr,
torch.ones([1], device=mean.device, dtype=mean.dtype),
],
dim=0,
)
vec = differentiable_all_reduce(vec * B)
total_batch = vec[-1].detach()
momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0
mean, meansqr, _ = torch.split(vec / total_batch.clamp(min=1), C) # avoid div-by-zero
var = meansqr - mean * mean
invstd = torch.rsqrt(var + self.eps)
scale = self.weight * invstd
bias = self.bias - mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
self.running_mean += momentum * (mean.detach() - self.running_mean)
self.running_var += momentum * (var.detach() - self.running_var)
ret = input * scale + bias
if half_input:
ret = ret.half()
return ret
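# Illustrative usage sketch (editor's addition): a drop-in replacement for BatchNorm2d that
# remains correct under distributed training with varying per-worker batch sizes;
# stats_mode="N" additionally tolerates zero-sized batches on some workers.
def _example_naive_sync_bn(channels: int = 64) -> nn.Module:
    return NaiveSyncBatchNorm(channels, stats_mode="N")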
class CycleBatchNormList(nn.ModuleList):
"""
Implement domain-specific BatchNorm by cycling.
When a BatchNorm layer is used for multiple input domains or input
features, it might need to maintain separate test-time statistics
for each domain. See Sec 5.2 in :paper:`rethinking-batchnorm`.
This module implements it by using N separate BN layers
and it cycles through them every time a forward() is called.
NOTE: The caller of this module MUST guarantee to always call
this module a multiple of N times. Otherwise its test-time statistics
will be incorrect.
"""
def __init__(self, length: int, bn_class=nn.BatchNorm2d, **kwargs):
"""
Args:
length: number of BatchNorm layers to cycle.
bn_class: the BatchNorm class to use
kwargs: arguments of the BatchNorm class, such as num_features.
"""
self._affine = kwargs.pop("affine", True)
super().__init__([bn_class(**kwargs, affine=False) for k in range(length)])
if self._affine:
# shared affine, domain-specific BN
channels = self[0].num_features
self.weight = nn.Parameter(torch.ones(channels))
self.bias = nn.Parameter(torch.zeros(channels))
self._pos = 0
def forward(self, x):
ret = self[self._pos](x)
self._pos = (self._pos + 1) % len(self)
if self._affine:
w = self.weight.reshape(1, -1, 1, 1)
b = self.bias.reshape(1, -1, 1, 1)
return ret * w + b
else:
return ret
def extra_repr(self):
return f"affine={self._affine}"
class LayerNorm(nn.Module):
"""
A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
variance normalization over the channel dimension for inputs that have shape
(batch_size, channels, height, width).
https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950
"""
def __init__(self, normalized_shape, eps=1e-6):
super().__init__()
self.weight = nn.Parameter(torch.ones(normalized_shape))
self.bias = nn.Parameter(torch.zeros(normalized_shape))
self.eps = eps
self.normalized_shape = (normalized_shape,)
def forward(self, x):
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
x = self.weight[:, None, None] * x + self.bias[:, None, None]
return x
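# Illustrative usage sketch (editor's addition): channels-first LayerNorm over an
# (N, C, H, W) feature map, normalizing across the C dimension at every spatial location.
def _example_channels_first_layernorm():
    ln = LayerNorm(normalized_shape=64)
    x = torch.randn(2, 64, 16, 16)
    return ln(x)  # same shape as x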
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
import fvcore.nn.weight_init as weight_init
from torch import nn
from .batch_norm import FrozenBatchNorm2d, get_norm
from .wrappers import Conv2d
"""
CNN building blocks.
"""
class CNNBlockBase(nn.Module):
"""
A CNN block is assumed to have input channels, output channels and a stride.
The input and output of the `forward()` method must be NCHW tensors.
The method can perform arbitrary computation but must match the given
channels and stride specification.
Attributes:
in_channels (int):
out_channels (int):
stride (int):
"""
def __init__(self, in_channels, out_channels, stride):
"""
The `__init__` method of any subclass should also contain these arguments.
Args:
in_channels (int):
out_channels (int):
stride (int):
"""
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.stride = stride
def freeze(self):
"""
Make this block not trainable.
This method sets all parameters to `requires_grad=False`,
and converts all BatchNorm layers to FrozenBatchNorm.
Returns:
the block itself
"""
for p in self.parameters():
p.requires_grad = False
FrozenBatchNorm2d.convert_frozen_batchnorm(self)
return self
class DepthwiseSeparableConv2d(nn.Module):
"""
A kxk depthwise convolution + a 1x1 convolution.
In :paper:`xception`, norm & activation are applied on the second conv.
:paper:`mobilenet` uses norm & activation on both convs.
"""
def __init__(
self,
in_channels,
out_channels,
kernel_size=3,
padding=1,
dilation=1,
*,
norm1=None,
activation1=None,
norm2=None,
activation2=None,
):
"""
Args:
norm1, norm2 (str or callable): normalization for the two conv layers.
activation1, activation2 (callable(Tensor) -> Tensor): activation
function for the two conv layers.
"""
super().__init__()
self.depthwise = Conv2d(
in_channels,
in_channels,
kernel_size=kernel_size,
padding=padding,
dilation=dilation,
groups=in_channels,
bias=not norm1,
norm=get_norm(norm1, in_channels),
activation=activation1,
)
self.pointwise = Conv2d(
in_channels,
out_channels,
kernel_size=1,
bias=not norm2,
norm=get_norm(norm2, out_channels),
activation=activation2,
)
# default initialization
weight_init.c2_msra_fill(self.depthwise)
weight_init.c2_msra_fill(self.pointwise)
def forward(self, x):
return self.pointwise(self.depthwise(x))
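# Illustrative usage sketch (editor's addition): a 3x3 depthwise + 1x1 pointwise block with
# norm/activation on both convs (MobileNet style). The values below are arbitrary examples.
def _example_depthwise_separable():
    import torch  # blocks.py itself only imports `nn`; needed here for the dummy input

    block = DepthwiseSeparableConv2d(
        in_channels=32,
        out_channels=64,
        kernel_size=3,
        padding=1,
        norm1="BN",
        activation1=nn.ReLU(),
        norm2="BN",
        activation2=nn.ReLU(),
    )
    x = torch.randn(1, 32, 28, 28)
    return block(x)  # shape: (1, 64, 28, 28)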
To add a new Op:
1. Create a new directory
2. Implement new ops there
3. Declare its Python interface in `vision.cpp`.
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <torch/types.h>
namespace detectron2 {
at::Tensor ROIAlignRotated_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio);
at::Tensor ROIAlignRotated_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio);
#if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor ROIAlignRotated_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio);
at::Tensor ROIAlignRotated_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio);
#endif
// Interface for Python
inline at::Tensor ROIAlignRotated_forward(
const at::Tensor& input,
const at::Tensor& rois,
const double spatial_scale,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t sampling_ratio) {
if (input.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
return ROIAlignRotated_forward_cuda(
input,
rois,
spatial_scale,
pooled_height,
pooled_width,
sampling_ratio);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
return ROIAlignRotated_forward_cpu(
input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
}
inline at::Tensor ROIAlignRotated_backward(
const at::Tensor& grad,
const at::Tensor& rois,
const double spatial_scale,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t batch_size,
const int64_t channels,
const int64_t height,
const int64_t width,
const int64_t sampling_ratio) {
if (grad.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
return ROIAlignRotated_backward_cuda(
grad,
rois,
spatial_scale,
pooled_height,
pooled_width,
batch_size,
channels,
height,
width,
sampling_ratio);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
return ROIAlignRotated_backward_cpu(
grad,
rois,
spatial_scale,
pooled_height,
pooled_width,
batch_size,
channels,
height,
width,
sampling_ratio);
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include <ATen/TensorUtils.h>
#include "ROIAlignRotated.h"
// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
// and PyTorch ROIAlign (non-rotated) Op implementations.
// The key difference between this implementation and those ones is
// we don't do "legacy offset" in this version, as there aren't many previous
// works, if any, using the "legacy" ROIAlignRotated Op.
// This would make the interface a bit cleaner.
namespace detectron2 {
namespace {
template <typename T>
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
T w1;
T w2;
T w3;
T w4;
};
template <typename T>
void pre_calc_for_bilinear_interpolate(
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int iy_upper,
const int ix_upper,
T roi_start_h,
T roi_start_w,
T bin_size_h,
T bin_size_w,
int roi_bin_grid_h,
int roi_bin_grid_w,
T roi_center_h,
T roi_center_w,
T cos_theta,
T sin_theta,
std::vector<PreCalc<T>>& pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
// In image space, (y, x) is the order for Right Handed System,
// and this is essentially multiplying the point by a rotation matrix
// to rotate it counterclockwise through angle theta.
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc<T> pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
template <typename T>
void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T& w1,
T& w2,
T& w3,
T& w4,
int& x_low,
int& x_high,
int& y_low,
int& y_high) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template <class T>
inline void add(T* address, const T& val) {
*address += val;
}
} // namespace
template <typename T>
void ROIAlignRotatedForward(
const int nthreads,
const T* input,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* rois,
T* output) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
AT_ASSERTM(
roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlignRotated do not have non-negative size!");
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc<T>> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
pre_calc_for_bilinear_interpolate(
height,
width,
pooled_height,
pooled_width,
roi_bin_grid_h,
roi_bin_grid_w,
roi_start_h,
roi_start_w,
bin_size_h,
bin_size_w,
roi_bin_grid_h,
roi_bin_grid_w,
roi_center_h,
roi_center_w,
cos_theta,
sin_theta,
pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
output[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
template <typename T>
void ROIAlignRotatedBackward(
const int nthreads,
// may not be contiguous. should index using n_stride, etc
const T* grad_output,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* grad_input,
const T* rois,
const int n_stride,
const int c_stride,
const int h_stride,
const int w_stride) {
for (int index = 0; index < nthreads; index++) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
AT_ASSERTM(
roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlignRotated do not have non-negative size!");
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_grad_input =
grad_input + ((roi_batch_ind * channels + c) * height * width);
int output_offset = n * n_stride + c * c_stride;
const T* offset_grad_output = grad_output + output_offset;
const T grad_output_this_bin =
offset_grad_output[ph * h_stride + pw * w_stride];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
T g1 = grad_output_this_bin * w1 / count;
T g2 = grad_output_this_bin * w2 / count;
T g3 = grad_output_this_bin * w3 / count;
T g4 = grad_output_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
} // if
} // ix
} // iy
} // for
} // ROIAlignRotatedBackward
at::Tensor ROIAlignRotated_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_forward_cpu";
at::checkAllSameType(c, {input_t, rois_t});
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
at::Tensor output = at::zeros(
{num_rois, channels, pooled_height, pooled_width}, input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
if (output.numel() == 0) {
return output;
}
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIAlignRotated_forward", [&] {
ROIAlignRotatedForward<scalar_t>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
return output;
}
at::Tensor ROIAlignRotated_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio) {
AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlignRotated_backward_cpu";
at::checkAllSameType(c, {grad_t, rois_t});
at::Tensor grad_input =
at::zeros({batch_size, channels, height, width}, grad.options());
// handle possibly empty gradients
if (grad.numel() == 0) {
return grad_input;
}
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad.stride(0);
int c_stride = grad.stride(1);
int h_stride = grad.stride(2);
int w_stride = grad.stride(3);
auto rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "ROIAlignRotated_forward", [&] {
ROIAlignRotatedBackward<scalar_t>(
grad.numel(),
grad.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>(),
n_stride,
c_stride,
h_stride,
w_stride);
});
return grad_input;
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
// and PyTorch ROIAlign (non-rotated) Op implementations.
// The key difference between this implementation and those ones is
// we don't do "legacy offset" in this version, as there aren't many previous
// works, if any, using the "legacy" ROIAlignRotated Op.
// This would make the interface a bit cleaner.
namespace detectron2 {
namespace {
template <typename T>
__device__ T bilinear_interpolate(
const T* input,
const int height,
const int width,
T y,
T x) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
return 0;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = input[y_low * width + x_low];
T v2 = input[y_low * width + x_high];
T v3 = input[y_high * width + x_low];
T v4 = input[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__device__ void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T& w1,
T& w2,
T& w3,
T& w4,
int& x_low,
int& x_high,
int& y_low,
int& y_high) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
} // namespace
template <typename T>
__global__ void RoIAlignRotatedForward(
const int nthreads,
const T* input,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* rois,
T* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
// We do average (integral) pooling inside a bin
const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
T val = bilinear_interpolate(offset_input, height, width, y, x);
output_val += val;
}
}
output_val /= count;
top_data[index] = output_val;
}
}
template <typename T>
__global__ void RoIAlignRotatedBackwardFeature(
const int nthreads,
const T* top_diff,
const int num_rois,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* bottom_diff,
const T* rois) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
T g1 = top_diff_this_bin * w1 / count;
T g2 = top_diff_this_bin * w2 / count;
T g3 = top_diff_this_bin * w3 / count;
T g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(
offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
atomicAdd(
offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
atomicAdd(
offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
atomicAdd(
offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // RoIAlignRotatedBackward
at::Tensor ROIAlignRotated_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlignRotated_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t});
at::cuda::CUDAGuard device_guard(input.device());
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
auto output = at::empty(
{num_rois, channels, pooled_height, pooled_width}, input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
at::cuda::ATenCeilDiv(
static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
if (output.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return output;
}
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "ROIAlignRotated_forward", [&] {
RoIAlignRotatedForward<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return output;
}
// TODO remove the dependency on input and use instead its sizes -> save memory
at::Tensor ROIAlignRotated_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio) {
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlignRotated_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t});
at::checkAllSameType(c, {grad_t, rois_t});
at::cuda::CUDAGuard device_guard(grad.device());
auto num_rois = rois.size(0);
auto grad_input =
at::zeros({batch_size, channels, height, width}, grad.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
at::cuda::ATenCeilDiv(
static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
// handle possibly empty gradients
if (grad.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES(
grad.scalar_type(), "ROIAlignRotated_backward", [&] {
RoIAlignRotatedBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad_.data_ptr<scalar_t>(),
num_rois,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>());
});
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <torch/types.h>
namespace detectron2 {
at::Tensor box_iou_rotated_cpu(
const at::Tensor& boxes1,
const at::Tensor& boxes2);
#if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor box_iou_rotated_cuda(
const at::Tensor& boxes1,
const at::Tensor& boxes2);
#endif
// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
inline at::Tensor box_iou_rotated(
const at::Tensor& boxes1,
const at::Tensor& boxes2) {
assert(boxes1.device().is_cuda() == boxes2.device().is_cuda());
if (boxes1.device().is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous());
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous());
}
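// Illustrative usage sketch (not part of the original header): how the
// dispatcher above is typically called from C++. Boxes are rows of an N x 5
// float tensor in (x_ctr, y_ctr, w, h, angle_degrees) layout; the helper name
// below is hypothetical.
inline at::Tensor box_iou_rotated_example() {
  // two axis-aligned unit boxes whose centers are half a box apart;
  // intersection = 0.5, union = 1.5, so the expected IoU is 1/3
  float b1[5] = {0.5f, 0.5f, 1.0f, 1.0f, 0.0f};
  float b2[5] = {1.0f, 0.5f, 1.0f, 1.0f, 0.0f};
  at::Tensor boxes1 = at::from_blob(b1, {1, 5}, at::kFloat).clone();
  at::Tensor boxes2 = at::from_blob(b2, {1, 5}, at::kFloat).clone();
  return box_iou_rotated(boxes1, boxes2); // 1 x 1 tensor, value ~0.3333
}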
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include "box_iou_rotated.h"
#include "box_iou_rotated_utils.h"
namespace detectron2 {
template <typename T>
void box_iou_rotated_cpu_kernel(
const at::Tensor& boxes1,
const at::Tensor& boxes2,
at::Tensor& ious) {
auto num_boxes1 = boxes1.size(0);
auto num_boxes2 = boxes2.size(0);
for (int i = 0; i < num_boxes1; i++) {
for (int j = 0; j < num_boxes2; j++) {
ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>());
}
}
}
at::Tensor box_iou_rotated_cpu(
// input must be contiguous:
const at::Tensor& boxes1,
const at::Tensor& boxes2) {
auto num_boxes1 = boxes1.size(0);
auto num_boxes2 = boxes2.size(0);
at::Tensor ious =
at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious);
// reshape from 1d array to 2d array
auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
return ious.reshape(shape);
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include "box_iou_rotated_utils.h"
namespace detectron2 {
// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;
template <typename T>
__global__ void box_iou_rotated_cuda_kernel(
const int n_boxes1,
const int n_boxes2,
const T* dev_boxes1,
const T* dev_boxes2,
T* dev_ious) {
const int row_start = blockIdx.x * blockDim.x;
const int col_start = blockIdx.y * blockDim.y;
const int row_size = min(n_boxes1 - row_start, blockDim.x);
const int col_size = min(n_boxes2 - col_start, blockDim.y);
__shared__ float block_boxes1[BLOCK_DIM_X * 5];
__shared__ float block_boxes2[BLOCK_DIM_Y * 5];
// It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y
if (threadIdx.x < row_size && threadIdx.y == 0) {
block_boxes1[threadIdx.x * 5 + 0] =
dev_boxes1[(row_start + threadIdx.x) * 5 + 0];
block_boxes1[threadIdx.x * 5 + 1] =
dev_boxes1[(row_start + threadIdx.x) * 5 + 1];
block_boxes1[threadIdx.x * 5 + 2] =
dev_boxes1[(row_start + threadIdx.x) * 5 + 2];
block_boxes1[threadIdx.x * 5 + 3] =
dev_boxes1[(row_start + threadIdx.x) * 5 + 3];
block_boxes1[threadIdx.x * 5 + 4] =
dev_boxes1[(row_start + threadIdx.x) * 5 + 4];
}
if (threadIdx.x < col_size && threadIdx.y == 0) {
block_boxes2[threadIdx.x * 5 + 0] =
dev_boxes2[(col_start + threadIdx.x) * 5 + 0];
block_boxes2[threadIdx.x * 5 + 1] =
dev_boxes2[(col_start + threadIdx.x) * 5 + 1];
block_boxes2[threadIdx.x * 5 + 2] =
dev_boxes2[(col_start + threadIdx.x) * 5 + 2];
block_boxes2[threadIdx.x * 5 + 3] =
dev_boxes2[(col_start + threadIdx.x) * 5 + 3];
block_boxes2[threadIdx.x * 5 + 4] =
dev_boxes2[(col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
if (threadIdx.x < row_size && threadIdx.y < col_size) {
int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y;
dev_ious[offset] = single_box_iou_rotated<T>(
block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
}
}
at::Tensor box_iou_rotated_cuda(
// input must be contiguous
const at::Tensor& boxes1,
const at::Tensor& boxes2) {
using scalar_t = float;
AT_ASSERTM(
boxes1.scalar_type() == at::kFloat, "boxes1 must be a float tensor");
AT_ASSERTM(
boxes2.scalar_type() == at::kFloat, "boxes2 must be a float tensor");
AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor");
AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor");
at::cuda::CUDAGuard device_guard(boxes1.device());
auto num_boxes1 = boxes1.size(0);
auto num_boxes2 = boxes2.size(0);
at::Tensor ious =
at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
bool transpose = false;
if (num_boxes1 > 0 && num_boxes2 > 0) {
scalar_t *data1 = boxes1.data_ptr<scalar_t>(),
*data2 = boxes2.data_ptr<scalar_t>();
if (num_boxes2 > 65535 * BLOCK_DIM_Y) {
AT_ASSERTM(
num_boxes1 <= 65535 * BLOCK_DIM_Y,
"Too many boxes for box_iou_rotated_cuda!");
// x dim is allowed to be large, but y dim cannot,
// so we transpose the two to avoid "invalid configuration argument"
// error. We assume one of them is small. Otherwise the result is hard to
// fit in memory anyway.
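// e.g., with BLOCK_DIM_Y = 16 the grid's y dimension can cover at most
// 65535 * 16 = 1,048,560 boxes, so larger inputs must be placed on the x axis.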
std::swap(num_boxes1, num_boxes2);
std::swap(data1, data2);
transpose = true;
}
const int blocks_x =
at::cuda::ATenCeilDiv(static_cast<int>(num_boxes1), BLOCK_DIM_X);
const int blocks_y =
at::cuda::ATenCeilDiv(static_cast<int>(num_boxes2), BLOCK_DIM_Y);
dim3 blocks(blocks_x, blocks_y);
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
box_iou_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
num_boxes1,
num_boxes2,
data1,
data2,
(scalar_t*)ious.data_ptr<scalar_t>());
AT_CUDA_CHECK(cudaGetLastError());
}
// reshape from 1d array to 2d array
auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
if (transpose) {
return ious.view(shape).t();
} else {
return ious.view(shape);
}
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <cassert>
#include <cmath>
#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1
// Designates functions callable from the host (CPU) and the device (GPU)
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
#else
#include <algorithm>
#define HOST_DEVICE
#define HOST_DEVICE_INLINE HOST_DEVICE inline
#endif
namespace detectron2 {
namespace {
template <typename T>
struct RotatedBox {
T x_ctr, y_ctr, w, h, a;
};
template <typename T>
struct Point {
T x, y;
HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
HOST_DEVICE_INLINE Point operator+(const Point& p) const {
return Point(x + p.x, y + p.y);
}
HOST_DEVICE_INLINE Point& operator+=(const Point& p) {
x += p.x;
y += p.y;
return *this;
}
HOST_DEVICE_INLINE Point operator-(const Point& p) const {
return Point(x - p.x, y - p.y);
}
HOST_DEVICE_INLINE Point operator*(const T coeff) const {
return Point(x * coeff, y * coeff);
}
};
template <typename T>
HOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {
return A.x * B.x + A.y * B.y;
}
// R: result type. can be different from input type
template <typename T, typename R = T>
HOST_DEVICE_INLINE R cross_2d(const Point<T>& A, const Point<T>& B) {
return static_cast<R>(A.x) * static_cast<R>(B.y) -
static_cast<R>(B.x) * static_cast<R>(A.y);
}
template <typename T>
HOST_DEVICE_INLINE void get_rotated_vertices(
const RotatedBox<T>& box,
Point<T> (&pts)[4]) {
// M_PI / 180. == 0.01745329251
double theta = box.a * 0.01745329251;
T cosTheta2 = (T)cos(theta) * 0.5f;
T sinTheta2 = (T)sin(theta) * 0.5f;
// y: top --> down; x: left --> right
pts[0].x = box.x_ctr + sinTheta2 * box.h + cosTheta2 * box.w;
pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
pts[1].x = box.x_ctr - sinTheta2 * box.h + cosTheta2 * box.w;
pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
pts[2].x = 2 * box.x_ctr - pts[0].x;
pts[2].y = 2 * box.y_ctr - pts[0].y;
pts[3].x = 2 * box.x_ctr - pts[1].x;
pts[3].y = 2 * box.y_ctr - pts[1].y;
}
template <typename T>
HOST_DEVICE_INLINE int get_intersection_points(
const Point<T> (&pts1)[4],
const Point<T> (&pts2)[4],
Point<T> (&intersections)[24]) {
// Line vector
// A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
Point<T> vec1[4], vec2[4];
for (int i = 0; i < 4; i++) {
vec1[i] = pts1[(i + 1) % 4] - pts1[i];
vec2[i] = pts2[(i + 1) % 4] - pts2[i];
}
// When computing the intersection area, it doesn't hurt if we have
// more (duplicated/approximate) intersections/vertices than needed,
// while it can cause drastic difference if we miss an intersection/vertex.
// Therefore, we add an epsilon to relax the comparisons between
// the float point numbers that decide the intersection points.
double EPS = 1e-5;
// Line test - test all line combos for intersection
int num = 0; // number of intersections
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
// Solve for 2x2 Ax=b
T det = cross_2d<T>(vec2[j], vec1[i]);
// This takes care of parallel lines
if (fabs(det) <= 1e-14) {
continue;
}
auto vec12 = pts2[j] - pts1[i];
T t1 = cross_2d<T>(vec2[j], vec12) / det;
T t2 = cross_2d<T>(vec1[i], vec12) / det;
if (t1 > -EPS && t1 < 1.0f + EPS && t2 > -EPS && t2 < 1.0f + EPS) {
intersections[num++] = pts1[i] + vec1[i] * t1;
}
}
}
// Check for vertices of rect1 inside rect2
{
const auto& AB = vec2[0];
const auto& DA = vec2[3];
auto ABdotAB = dot_2d<T>(AB, AB);
auto ADdotAD = dot_2d<T>(DA, DA);
for (int i = 0; i < 4; i++) {
// assume ABCD is the rectangle, and P is the point to be judged
// P is inside ABCD iff. P's projection on AB lies within AB
// and P's projection on AD lies within AD
auto AP = pts1[i] - pts2[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) &&
(APdotAD < ADdotAD + EPS)) {
intersections[num++] = pts1[i];
}
}
}
// Reverse the check - check for vertices of rect2 inside rect1
{
const auto& AB = vec1[0];
const auto& DA = vec1[3];
auto ABdotAB = dot_2d<T>(AB, AB);
auto ADdotAD = dot_2d<T>(DA, DA);
for (int i = 0; i < 4; i++) {
auto AP = pts2[i] - pts1[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) &&
(APdotAD < ADdotAD + EPS)) {
intersections[num++] = pts2[i];
}
}
}
return num;
}
template <typename T>
HOST_DEVICE_INLINE int convex_hull_graham(
const Point<T> (&p)[24],
const int& num_in,
Point<T> (&q)[24],
bool shift_to_zero = false) {
assert(num_in >= 2);
// Step 1:
// Find point with minimum y
// if more than 1 points have the same minimum y,
// pick the one with the minimum x.
int t = 0;
for (int i = 1; i < num_in; i++) {
if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
t = i;
}
}
auto& start = p[t]; // starting point
// Step 2:
// Subtract the starting point from every point (for sorting in the next step)
for (int i = 0; i < num_in; i++) {
q[i] = p[i] - start;
}
// Swap the starting point to position 0
auto tmp = q[0];
q[0] = q[t];
q[t] = tmp;
// Step 3:
// Sort points 1 ~ (num_in - 1) according to their relative cross-product values
// (essentially sorting according to angles)
// If the angles are the same, sort according to their distance to origin
T dist[24];
#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1
// compute distance to origin before sort, and sort them together with the
// points
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
// CUDA version
// In the future, we can potentially use thrust
// for sorting here to improve speed (though not guaranteed)
for (int i = 1; i < num_in - 1; i++) {
for (int j = i + 1; j < num_in; j++) {
T crossProduct = cross_2d<T>(q[i], q[j]);
if ((crossProduct < -1e-6) ||
(fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
auto q_tmp = q[i];
q[i] = q[j];
q[j] = q_tmp;
auto dist_tmp = dist[i];
dist[i] = dist[j];
dist[j] = dist_tmp;
}
}
}
#else
// CPU version
// std::sort(
// q + 1, q + num_in, [](const Point<T>& A, const Point<T>& B) -> bool {
// T temp = cross_2d<T>(A, B);
// if (fabs(temp) < 1e-6) {
// return dot_2d<T>(A, A) < dot_2d<T>(B, B);
// } else {
// return temp > 0;
// }
// });
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
for (int i = 1; i < num_in - 1; i++) {
for (int j = i + 1; j < num_in; j++) {
T crossProduct = cross_2d<T>(q[i], q[j]);
if ((crossProduct < -1e-6) ||
(fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
auto q_tmp = q[i];
q[i] = q[j];
q[j] = q_tmp;
auto dist_tmp = dist[i];
dist[i] = dist[j];
dist[j] = dist_tmp;
}
}
}
// dist was permuted together with q by the sort above, so recomputing it here
// is redundant but harmless (kept from the original std::sort-based version).
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
#endif
// Step 4:
// Make sure there are at least 2 points (that don't overlap with each other)
// in the stack
int k; // index of the non-overlapped second point
for (k = 1; k < num_in; k++) {
if (dist[k] > 1e-8) {
break;
}
}
if (k == num_in) {
// We reach the end, which means the convex hull is just one point
q[0] = p[t];
return 1;
}
q[1] = q[k];
int m = 2; // 2 points in the stack
// Step 5:
// Finally we can start the scanning process.
// When a non-convex relationship between the 3 points is found
// (either concave shape or duplicated points),
// we pop the previous point from the stack
// until the 3-point relationship is convex again, or
// until the stack only contains two points
for (int i = k + 1; i < num_in; i++) {
while (m > 1) {
auto q1 = q[i] - q[m - 2], q2 = q[m - 1] - q[m - 2];
// cross_2d() uses FMA and therefore computes round(round(q1.x*q2.y) -
// q2.x*q1.y) So it may not return 0 even when q1==q2. Therefore we
// compare round(q1.x*q2.y) and round(q2.x*q1.y) directly. (round means
// round to nearest floating point).
if (q1.x * q2.y >= q2.x * q1.y)
m--;
else
break;
}
// Using double also helps, but float can solve the issue for now.
// while (m > 1 && cross_2d<T, double>(q[i] - q[m - 2], q[m - 1] - q[m - 2])
// >= 0) {
// m--;
// }
q[m++] = q[i];
}
// Step 6 (Optional):
// In general sense we need the original coordinates, so we
// need to shift the points back (reverting Step 2)
// But if we're only interested in getting the area/perimeter of the shape
// We can simply return.
if (!shift_to_zero) {
for (int i = 0; i < m; i++) {
q[i] += start;
}
}
return m;
}
template <typename T>
HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
if (m <= 2) {
return 0;
}
T area = 0;
for (int i = 1; i < m - 1; i++) {
area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
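// Worked example for polygon_area above: for the unit square
// q = (0,0), (1,0), (1,1), (0,1) with m = 4, the two triangle cross products
// are 1 and 1, so the returned area is (1 + 1) / 2 = 1.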
template <typename T>
HOST_DEVICE_INLINE T rotated_boxes_intersection(
const RotatedBox<T>& box1,
const RotatedBox<T>& box2) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
Point<T> intersectPts[24], orderedPts[24];
Point<T> pts1[4];
Point<T> pts2[4];
get_rotated_vertices<T>(box1, pts1);
get_rotated_vertices<T>(box2, pts2);
int num = get_intersection_points<T>(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
return polygon_area<T>(orderedPts, num_convex);
}
} // namespace
template <typename T>
HOST_DEVICE_INLINE T
single_box_iou_rotated(T const* const box1_raw, T const* const box2_raw) {
// shift center to the middle point to achieve higher precision in result
RotatedBox<T> box1, box2;
auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
box1.x_ctr = box1_raw[0] - center_shift_x;
box1.y_ctr = box1_raw[1] - center_shift_y;
box1.w = box1_raw[2];
box1.h = box1_raw[3];
box1.a = box1_raw[4];
box2.x_ctr = box2_raw[0] - center_shift_x;
box2.y_ctr = box2_raw[1] - center_shift_y;
box2.w = box2_raw[2];
box2.h = box2_raw[3];
box2.a = box2_raw[4];
T area1 = box1.w * box1.h;
T area2 = box2.w * box2.h;
if (area1 < 1e-14 || area2 < 1e-14) {
return 0.f;
}
T intersection = rotated_boxes_intersection<T>(box1, box2);
T iou = intersection / (area1 + area2 - intersection);
return iou;
}
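// Illustrative sketch (not part of the original header): single_box_iou_rotated
// on two concrete boxes in (x_ctr, y_ctr, w, h, angle_degrees) layout. Two
// identical 2 x 1 boxes, one rotated by 90 degrees, overlap in a 1 x 1 square,
// so the expected IoU is 1 / (2 + 2 - 1) = 1/3. The function name is ours.
HOST_DEVICE_INLINE float single_box_iou_rotated_example() {
  const float box1[5] = {0.f, 0.f, 2.f, 1.f, 0.f};
  const float box2[5] = {0.f, 0.f, 2.f, 1.f, 90.f};
  return single_box_iou_rotated<float>(box1, box2);
}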
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include "cocoeval.h"
#include <time.h>
#include <algorithm>
#include <cstdint>
#include <numeric>
using namespace pybind11::literals;
namespace detectron2 {
namespace COCOeval {
// Sort detections from highest score to lowest, such that
// detection_instances[detection_sorted_indices[t]] >=
// detection_instances[detection_sorted_indices[t+1]]. Use stable_sort to match
// original COCO API
void SortInstancesByDetectionScore(
const std::vector<InstanceAnnotation>& detection_instances,
std::vector<uint64_t>* detection_sorted_indices) {
detection_sorted_indices->resize(detection_instances.size());
std::iota(
detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
std::stable_sort(
detection_sorted_indices->begin(),
detection_sorted_indices->end(),
[&detection_instances](size_t j1, size_t j2) {
return detection_instances[j1].score > detection_instances[j2].score;
});
}
// Partition the ground truth objects based on whether or not to ignore them
// based on area
void SortInstancesByIgnore(
const std::array<double, 2>& area_range,
const std::vector<InstanceAnnotation>& ground_truth_instances,
std::vector<uint64_t>* ground_truth_sorted_indices,
std::vector<bool>* ignores) {
ignores->clear();
ignores->reserve(ground_truth_instances.size());
for (const auto& o : ground_truth_instances) {
ignores->push_back(
o.ignore || o.area < area_range[0] || o.area > area_range[1]);
}
ground_truth_sorted_indices->resize(ground_truth_instances.size());
std::iota(
ground_truth_sorted_indices->begin(),
ground_truth_sorted_indices->end(),
0);
std::stable_sort(
ground_truth_sorted_indices->begin(),
ground_truth_sorted_indices->end(),
[&ignores](size_t j1, size_t j2) {
return (int)(*ignores)[j1] < (int)(*ignores)[j2];
});
}
// For each IOU threshold, greedily match each detected instance to a ground
// truth instance (if possible) and store the results
void MatchDetectionsToGroundTruth(
const std::vector<InstanceAnnotation>& detection_instances,
const std::vector<uint64_t>& detection_sorted_indices,
const std::vector<InstanceAnnotation>& ground_truth_instances,
const std::vector<uint64_t>& ground_truth_sorted_indices,
const std::vector<bool>& ignores,
const std::vector<std::vector<double>>& ious,
const std::vector<double>& iou_thresholds,
const std::array<double, 2>& area_range,
ImageEvaluation* results) {
// Initialize memory to store return data matches and ignore
const int num_iou_thresholds = iou_thresholds.size();
const int num_ground_truth = ground_truth_sorted_indices.size();
const int num_detections = detection_sorted_indices.size();
std::vector<uint64_t> ground_truth_matches(
num_iou_thresholds * num_ground_truth, 0);
std::vector<uint64_t>& detection_matches = results->detection_matches;
std::vector<bool>& detection_ignores = results->detection_ignores;
std::vector<bool>& ground_truth_ignores = results->ground_truth_ignores;
detection_matches.resize(num_iou_thresholds * num_detections, 0);
detection_ignores.resize(num_iou_thresholds * num_detections, false);
ground_truth_ignores.resize(num_ground_truth);
for (auto g = 0; g < num_ground_truth; ++g) {
ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]];
}
for (auto t = 0; t < num_iou_thresholds; ++t) {
for (auto d = 0; d < num_detections; ++d) {
// information about best match so far (match=-1 -> unmatched)
double best_iou = std::min(iou_thresholds[t], 1 - 1e-10);
int match = -1;
for (auto g = 0; g < num_ground_truth; ++g) {
// if this ground truth instance is already matched and not a
// crowd, it cannot be matched to another detection
if (ground_truth_matches[t * num_ground_truth + g] > 0 &&
!ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) {
continue;
}
// if detected instance matched to a regular ground truth
// instance, we can break on the first ground truth instance
// tagged as ignore (because they are sorted by the ignore tag)
if (match >= 0 && !ground_truth_ignores[match] &&
ground_truth_ignores[g]) {
break;
}
// if IOU overlap is the best so far, store the match appropriately
if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) {
best_iou = ious[d][ground_truth_sorted_indices[g]];
match = g;
}
}
// if match was made, store id of match for both detection and
// ground truth
if (match >= 0) {
detection_ignores[t * num_detections + d] = ground_truth_ignores[match];
detection_matches[t * num_detections + d] =
ground_truth_instances[ground_truth_sorted_indices[match]].id;
ground_truth_matches[t * num_ground_truth + match] =
detection_instances[detection_sorted_indices[d]].id;
}
// set unmatched detections outside of area range to ignore
const InstanceAnnotation& detection =
detection_instances[detection_sorted_indices[d]];
detection_ignores[t * num_detections + d] =
detection_ignores[t * num_detections + d] ||
(detection_matches[t * num_detections + d] == 0 &&
(detection.area < area_range[0] || detection.area > area_range[1]));
}
}
// store detection score results
results->detection_scores.resize(detection_sorted_indices.size());
for (size_t d = 0; d < detection_sorted_indices.size(); ++d) {
results->detection_scores[d] =
detection_instances[detection_sorted_indices[d]].score;
}
}
std::vector<ImageEvaluation> EvaluateImages(
const std::vector<std::array<double, 2>>& area_ranges,
int max_detections,
const std::vector<double>& iou_thresholds,
const ImageCategoryInstances<std::vector<double>>& image_category_ious,
const ImageCategoryInstances<InstanceAnnotation>&
image_category_ground_truth_instances,
const ImageCategoryInstances<InstanceAnnotation>&
image_category_detection_instances) {
const int num_area_ranges = area_ranges.size();
const int num_images = image_category_ground_truth_instances.size();
const int num_categories =
image_category_ious.size() > 0 ? image_category_ious[0].size() : 0;
std::vector<uint64_t> detection_sorted_indices;
std::vector<uint64_t> ground_truth_sorted_indices;
std::vector<bool> ignores;
std::vector<ImageEvaluation> results_all(
num_images * num_area_ranges * num_categories);
// Store results for each image, category, and area range combination. Results
// for each IOU threshold are packed into the same ImageEvaluation object
for (auto i = 0; i < num_images; ++i) {
for (auto c = 0; c < num_categories; ++c) {
const std::vector<InstanceAnnotation>& ground_truth_instances =
image_category_ground_truth_instances[i][c];
const std::vector<InstanceAnnotation>& detection_instances =
image_category_detection_instances[i][c];
SortInstancesByDetectionScore(
detection_instances, &detection_sorted_indices);
if ((int)detection_sorted_indices.size() > max_detections) {
detection_sorted_indices.resize(max_detections);
}
for (size_t a = 0; a < area_ranges.size(); ++a) {
SortInstancesByIgnore(
area_ranges[a],
ground_truth_instances,
&ground_truth_sorted_indices,
&ignores);
MatchDetectionsToGroundTruth(
detection_instances,
detection_sorted_indices,
ground_truth_instances,
ground_truth_sorted_indices,
ignores,
image_category_ious[i][c],
iou_thresholds,
area_ranges[a],
&results_all
[c * num_area_ranges * num_images + a * num_images + i]);
}
}
}
return results_all;
}
// Convert a python list to a vector
template <typename T>
std::vector<T> list_to_vec(const py::list& l) {
std::vector<T> v(py::len(l));
for (int i = 0; i < (int)py::len(l); ++i) {
v[i] = l[i].cast<T>();
}
return v;
}
// Helper function to Accumulate()
// Considers the evaluation results applicable to a particular category, area
// range, and max_detections parameter setting, which begin at
// evaluations[evaluation_index]. Extracts a sorted list of length n of all
// applicable detection instances concatenated across all images in the dataset,
// which are represented by the outputs evaluation_indices, detection_scores,
// image_detection_indices, and detection_sorted_indices--all of which are
// length n. evaluation_indices[i] stores the applicable index into
// evaluations[] for instance i, which has detection score detection_score[i],
// and is the image_detection_indices[i]'th of the list of detections
// for the image containing i. detection_sorted_indices[] defines a sorted
// permutation of the 3 other outputs
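// For example, with num_images = 2, max_detections = 3, and 2 and 3 detections
// in the two images respectively, n = 5: evaluation_indices is
// {e, e, e+1, e+1, e+1} (with e = evaluation_index), image_detection_indices is
// {0, 1, 0, 1, 2}, and detection_sorted_indices orders these 5 entries by
// decreasing detection score.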
int BuildSortedDetectionList(
const std::vector<ImageEvaluation>& evaluations,
const int64_t evaluation_index,
const int64_t num_images,
const int max_detections,
std::vector<uint64_t>* evaluation_indices,
std::vector<double>* detection_scores,
std::vector<uint64_t>* detection_sorted_indices,
std::vector<uint64_t>* image_detection_indices) {
assert(evaluations.size() >= evaluation_index + num_images);
// Extract a list of object instances of the applicable category, area
// range, and max detections requirements such that they can be sorted
image_detection_indices->clear();
evaluation_indices->clear();
detection_scores->clear();
image_detection_indices->reserve(num_images * max_detections);
evaluation_indices->reserve(num_images * max_detections);
detection_scores->reserve(num_images * max_detections);
int num_valid_ground_truth = 0;
for (auto i = 0; i < num_images; ++i) {
const ImageEvaluation& evaluation = evaluations[evaluation_index + i];
for (int d = 0;
d < (int)evaluation.detection_scores.size() && d < max_detections;
++d) { // detected instances
evaluation_indices->push_back(evaluation_index + i);
image_detection_indices->push_back(d);
detection_scores->push_back(evaluation.detection_scores[d]);
}
for (auto ground_truth_ignore : evaluation.ground_truth_ignores) {
if (!ground_truth_ignore) {
++num_valid_ground_truth;
}
}
}
// Sort detections by decreasing score, using stable sort to match
// python implementation
detection_sorted_indices->resize(detection_scores->size());
std::iota(
detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
std::stable_sort(
detection_sorted_indices->begin(),
detection_sorted_indices->end(),
[&detection_scores](size_t j1, size_t j2) {
return (*detection_scores)[j1] > (*detection_scores)[j2];
});
return num_valid_ground_truth;
}
// Helper function to Accumulate()
// Compute a precision recall curve given a sorted list of detected instances
// encoded in evaluations, evaluation_indices, detection_scores,
// detection_sorted_indices, image_detection_indices (see
// BuildSortedDetectionList()). Using vectors precisions and recalls
// and temporary storage, output the results into precisions_out, recalls_out,
// and scores_out, which are large buffers containing many precision/recall curves
// for all possible parameter settings, with precisions_out_index and
// recalls_out_index defining the applicable indices to store results.
void ComputePrecisionRecallCurve(
const int64_t precisions_out_index,
const int64_t precisions_out_stride,
const int64_t recalls_out_index,
const std::vector<double>& recall_thresholds,
const int iou_threshold_index,
const int num_iou_thresholds,
const int num_valid_ground_truth,
const std::vector<ImageEvaluation>& evaluations,
const std::vector<uint64_t>& evaluation_indices,
const std::vector<double>& detection_scores,
const std::vector<uint64_t>& detection_sorted_indices,
const std::vector<uint64_t>& image_detection_indices,
std::vector<double>* precisions,
std::vector<double>* recalls,
std::vector<double>* precisions_out,
std::vector<double>* scores_out,
std::vector<double>* recalls_out) {
assert(recalls_out->size() > recalls_out_index);
// Compute precision/recall for each instance in the sorted list of detections
int64_t true_positives_sum = 0, false_positives_sum = 0;
precisions->clear();
recalls->clear();
precisions->reserve(detection_sorted_indices.size());
recalls->reserve(detection_sorted_indices.size());
assert(!evaluations.empty() || detection_sorted_indices.empty());
for (auto detection_sorted_index : detection_sorted_indices) {
const ImageEvaluation& evaluation =
evaluations[evaluation_indices[detection_sorted_index]];
const auto num_detections =
evaluation.detection_matches.size() / num_iou_thresholds;
const auto detection_index = iou_threshold_index * num_detections +
image_detection_indices[detection_sorted_index];
assert(evaluation.detection_matches.size() > detection_index);
assert(evaluation.detection_ignores.size() > detection_index);
const int64_t detection_match =
evaluation.detection_matches[detection_index];
const bool detection_ignores =
evaluation.detection_ignores[detection_index];
const auto true_positive = detection_match > 0 && !detection_ignores;
const auto false_positive = detection_match == 0 && !detection_ignores;
if (true_positive) {
++true_positives_sum;
}
if (false_positive) {
++false_positives_sum;
}
const double recall =
static_cast<double>(true_positives_sum) / num_valid_ground_truth;
recalls->push_back(recall);
const int64_t num_valid_detections =
true_positives_sum + false_positives_sum;
const double precision = num_valid_detections > 0
? static_cast<double>(true_positives_sum) / num_valid_detections
: 0.0;
precisions->push_back(precision);
}
(*recalls_out)[recalls_out_index] = !recalls->empty() ? recalls->back() : 0;
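// Make the precision curve monotonically non-increasing by scanning backwards
// and propagating the running maximum (the standard COCO interpolation applied
// before sampling at the recall thresholds below).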
for (int64_t i = static_cast<int64_t>(precisions->size()) - 1; i > 0; --i) {
if ((*precisions)[i] > (*precisions)[i - 1]) {
(*precisions)[i - 1] = (*precisions)[i];
}
}
// Sample the per instance precision/recall list at each recall threshold
for (size_t r = 0; r < recall_thresholds.size(); ++r) {
// first index in recalls >= recall_thresholds[r]
std::vector<double>::iterator low = std::lower_bound(
recalls->begin(), recalls->end(), recall_thresholds[r]);
size_t precisions_index = low - recalls->begin();
const auto results_ind = precisions_out_index + r * precisions_out_stride;
assert(results_ind < precisions_out->size());
assert(results_ind < scores_out->size());
if (precisions_index < precisions->size()) {
(*precisions_out)[results_ind] = (*precisions)[precisions_index];
(*scores_out)[results_ind] =
detection_scores[detection_sorted_indices[precisions_index]];
} else {
(*precisions_out)[results_ind] = 0;
(*scores_out)[results_ind] = 0;
}
}
}
py::dict Accumulate(
const py::object& params,
const std::vector<ImageEvaluation>& evaluations) {
const std::vector<double> recall_thresholds =
list_to_vec<double>(params.attr("recThrs"));
const std::vector<int> max_detections =
list_to_vec<int>(params.attr("maxDets"));
const int num_iou_thresholds = py::len(params.attr("iouThrs"));
const int num_recall_thresholds = py::len(params.attr("recThrs"));
const int num_categories = params.attr("useCats").cast<int>() == 1
? py::len(params.attr("catIds"))
: 1;
const int num_area_ranges = py::len(params.attr("areaRng"));
const int num_max_detections = py::len(params.attr("maxDets"));
const int num_images = py::len(params.attr("imgIds"));
std::vector<double> precisions_out(
num_iou_thresholds * num_recall_thresholds * num_categories *
num_area_ranges * num_max_detections,
-1);
std::vector<double> recalls_out(
num_iou_thresholds * num_categories * num_area_ranges *
num_max_detections,
-1);
std::vector<double> scores_out(
num_iou_thresholds * num_recall_thresholds * num_categories *
num_area_ranges * num_max_detections,
-1);
// Consider the list of all detected instances in the entire dataset in one
// large list. evaluation_indices, detection_scores,
// image_detection_indices, and detection_sorted_indices all have the same
// length as this list, such that each entry corresponds to one detected
// instance
std::vector<uint64_t> evaluation_indices; // indices into evaluations[]
std::vector<double> detection_scores; // detection scores of each instance
std::vector<uint64_t> detection_sorted_indices; // sorted indices of all
// instances in the dataset
std::vector<uint64_t>
image_detection_indices; // indices into the list of detected instances in
// the same image as each instance
std::vector<double> precisions, recalls;
for (auto c = 0; c < num_categories; ++c) {
for (auto a = 0; a < num_area_ranges; ++a) {
for (auto m = 0; m < num_max_detections; ++m) {
// The COCO PythonAPI assumes evaluations[] (the return value of
// COCOeval::EvaluateImages()) is one long list storing results for each
// combination of category, area range, and image id, with categories in
// the outermost loop and images in the innermost loop.
const int64_t evaluations_index =
c * num_area_ranges * num_images + a * num_images;
int num_valid_ground_truth = BuildSortedDetectionList(
evaluations,
evaluations_index,
num_images,
max_detections[m],
&evaluation_indices,
&detection_scores,
&detection_sorted_indices,
&image_detection_indices);
if (num_valid_ground_truth == 0) {
continue;
}
for (auto t = 0; t < num_iou_thresholds; ++t) {
// recalls_out is a flattened vector representing a
// num_iou_thresholds X num_categories X num_area_ranges X
// num_max_detections matrix
const int64_t recalls_out_index =
t * num_categories * num_area_ranges * num_max_detections +
c * num_area_ranges * num_max_detections +
a * num_max_detections + m;
// precisions_out and scores_out are flattened vectors
// representing a num_iou_thresholds X num_recall_thresholds X
// num_categories X num_area_ranges X num_max_detections matrix
const int64_t precisions_out_stride =
num_categories * num_area_ranges * num_max_detections;
const int64_t precisions_out_index = t * num_recall_thresholds *
num_categories * num_area_ranges * num_max_detections +
c * num_area_ranges * num_max_detections +
a * num_max_detections + m;
ComputePrecisionRecallCurve(
precisions_out_index,
precisions_out_stride,
recalls_out_index,
recall_thresholds,
t,
num_iou_thresholds,
num_valid_ground_truth,
evaluations,
evaluation_indices,
detection_scores,
detection_sorted_indices,
image_detection_indices,
&precisions,
&recalls,
&precisions_out,
&scores_out,
&recalls_out);
}
}
}
}
time_t rawtime;
struct tm local_time;
std::array<char, 200> buffer;
time(&rawtime);
#ifdef _WIN32
localtime_s(&local_time, &rawtime);
#else
localtime_r(&rawtime, &local_time);
#endif
strftime(buffer.data(), 200, "%Y-%m-%d %H:%M:%S", &local_time);
return py::dict(
"params"_a = params,
"counts"_a = std::vector<int64_t>(
{num_iou_thresholds,
num_recall_thresholds,
num_categories,
num_area_ranges,
num_max_detections}),
"date"_a = buffer,
"precision"_a = precisions_out,
"recall"_a = recalls_out,
"scores"_a = scores_out);
}
} // namespace COCOeval
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>
#include <vector>
namespace py = pybind11;
namespace detectron2 {
namespace COCOeval {
// Annotation data for a single object instance in an image
struct InstanceAnnotation {
InstanceAnnotation(
uint64_t id,
double score,
double area,
bool is_crowd,
bool ignore)
: id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {}
uint64_t id;
double score = 0.;
double area = 0.;
bool is_crowd = false;
bool ignore = false;
};
// Stores intermediate results for evaluating detection results for a single
// image that has D detected instances and G ground truth instances. This stores
// matches between detected and ground truth instances
struct ImageEvaluation {
// For each of the D detected instances, the id of the matched ground truth
// instance, or 0 if unmatched
std::vector<uint64_t> detection_matches;
// The detection score of each of the D detected instances
std::vector<double> detection_scores;
// Marks whether or not each of G instances was ignored from evaluation (e.g.,
// because it's outside area_range)
std::vector<bool> ground_truth_ignores;
// Marks whether or not each of D instances was ignored from evaluation (e.g.,
// because it's outside area_range)
std::vector<bool> detection_ignores;
};
template <class T>
using ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;
// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each
// combination of image, category, area range settings, and IOU thresholds to
// evaluate, it matches detected instances to ground truth instances and stores
// the results into a vector of ImageEvaluation results, which will be
// interpreted by the COCOeval::Accumulate() function to produce precision-recall
// curves. The parameters of nested vectors have the following semantics:
// image_category_ious[i][c][d][g] is the intersection over union of the d'th
// detected instance and g'th ground truth instance of
// category category_ids[c] in image image_ids[i]
// image_category_ground_truth_instances[i][c] is a vector of ground truth
// instances in image image_ids[i] of category category_ids[c]
// image_category_detection_instances[i][c] is a vector of detected
// instances in image image_ids[i] of category category_ids[c]
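// For example, image_category_ious[0][2][d][g] is the IOU between the d'th
// detected instance and the g'th ground truth instance of category
// category_ids[2] in image image_ids[0].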
std::vector<ImageEvaluation> EvaluateImages(
const std::vector<std::array<double, 2>>& area_ranges, // vector of 2-tuples
int max_detections,
const std::vector<double>& iou_thresholds,
const ImageCategoryInstances<std::vector<double>>& image_category_ious,
const ImageCategoryInstances<InstanceAnnotation>&
image_category_ground_truth_instances,
const ImageCategoryInstances<InstanceAnnotation>&
image_category_detection_instances);
// C++ implementation of COCOeval.accumulate(), which generates precision
// recall curves for each set of category, IOU threshold, detection area range,
// and max number of detections parameters. It is assumed that the parameter
// evaluations is the return value of the function COCOeval::EvaluateImages(),
// which was called with the same parameter settings params
py::dict Accumulate(
const py::object& params,
const std::vector<ImageEvaluation>& evaluations);
} // namespace COCOeval
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include <cuda_runtime_api.h>
namespace detectron2 {
int get_cudart_version() {
// Not a ROCM platform: Either HIP is not used, or
// it is used, but platform is not ROCM (i.e. it is CUDA)
#if !defined(__HIP_PLATFORM_AMD__)
return CUDART_VERSION;
#else
int version = 0;
#if HIP_VERSION_MAJOR != 0
// Create a convention similar to that of CUDA, as assumed by other
// parts of the code.
version = HIP_VERSION_MINOR;
version += (HIP_VERSION_MAJOR * 100);
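// e.g., HIP 3.7 is reported as 3 * 100 + 7 = 307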
#else
hipRuntimeGetVersion(&version);
#endif
return version;
#endif
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <torch/types.h>
namespace detectron2 {
#if defined(WITH_CUDA) || defined(WITH_HIP)
int deform_conv_forward_cuda(
at::Tensor input,
at::Tensor weight,
at::Tensor offset,
at::Tensor output,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step);
int deform_conv_backward_input_cuda(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradInput,
at::Tensor gradOffset,
at::Tensor weight,
at::Tensor columns,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step);
int deform_conv_backward_parameters_cuda(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradWeight, // at::Tensor gradBias,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
float scale,
int im2col_step);
void modulated_deform_conv_cuda_forward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor output,
at::Tensor columns,
int kernel_h,
int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const int group,
const int deformable_group,
const bool with_bias);
void modulated_deform_conv_cuda_backward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor columns,
at::Tensor grad_input,
at::Tensor grad_weight,
at::Tensor grad_bias,
at::Tensor grad_offset,
at::Tensor grad_mask,
at::Tensor grad_output,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int pad_h,
int pad_w,
int dilation_h,
int dilation_w,
int group,
int deformable_group,
const bool with_bias);
#endif
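// Shape reference for the dispatchers below (a summary, assuming the standard
// deformable-convolution layout expected by the CUDA kernels):
//   input : (N, C_in, H, W)
//   weight: (C_out, C_in / group, kH, kW)
//   offset: (N, deformable_group * 2 * kH * kW, H_out, W_out)
//   mask  : (N, deformable_group * kH * kW, H_out, W_out)  -- modulated variant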
inline int deform_conv_forward(
at::Tensor input,
at::Tensor weight,
at::Tensor offset,
at::Tensor output,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step) {
if (input.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
return deform_conv_forward_cuda(
input,
weight,
offset,
output,
columns,
ones,
kW,
kH,
dW,
dH,
padW,
padH,
dilationW,
dilationH,
group,
deformable_group,
im2col_step);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
AT_ERROR("This operator is not implemented on CPU");
}
inline int deform_conv_backward_input(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradInput,
at::Tensor gradOffset,
at::Tensor weight,
at::Tensor columns,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step) {
if (gradOutput.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
return deform_conv_backward_input_cuda(
input,
offset,
gradOutput,
gradInput,
gradOffset,
weight,
columns,
kW,
kH,
dW,
dH,
padW,
padH,
dilationW,
dilationH,
group,
deformable_group,
im2col_step);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
AT_ERROR("This operator is not implemented on CPU");
}
inline int deform_conv_backward_filter(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradWeight, // at::Tensor gradBias,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
float scale,
int im2col_step) {
if (gradOutput.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
return deform_conv_backward_parameters_cuda(
input,
offset,
gradOutput,
gradWeight,
columns,
ones,
kW,
kH,
dW,
dH,
padW,
padH,
dilationW,
dilationH,
group,
deformable_group,
scale,
im2col_step);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
AT_ERROR("This operator is not implemented on CPU");
}
inline void modulated_deform_conv_forward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor output,
at::Tensor columns,
int kernel_h,
int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const int group,
const int deformable_group,
const bool with_bias) {
if (input.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!");
TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
return modulated_deform_conv_cuda_forward(
input,
weight,
bias,
ones,
offset,
mask,
output,
columns,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
group,
deformable_group,
with_bias);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
AT_ERROR("This operator is not implemented on CPU");
}
inline void modulated_deform_conv_backward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor columns,
at::Tensor grad_input,
at::Tensor grad_weight,
at::Tensor grad_bias,
at::Tensor grad_offset,
at::Tensor grad_mask,
at::Tensor grad_output,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int pad_h,
int pad_w,
int dilation_h,
int dilation_w,
int group,
int deformable_group,
const bool with_bias) {
if (grad_output.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!");
TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
return modulated_deform_conv_cuda_backward(
input,
weight,
bias,
ones,
offset,
mask,
columns,
grad_input,
grad_weight,
grad_bias,
grad_offset,
grad_mask,
grad_output,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
group,
deformable_group,
with_bias);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
AT_ERROR("This operator is not implemented on CPU");
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
// modified from
// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp
// Original license: Apache 2.0
// modify from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
// Original license: Apache 2.0
#include <torch/types.h>
#include "deform_conv.h"
#include <cmath>
#include <vector>
namespace detectron2 {
void deformable_im2col(
const at::Tensor data_im,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor data_col);
void deformable_col2im(
const at::Tensor data_col,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor grad_im);
void deformable_col2im_coord(
const at::Tensor data_col,
const at::Tensor data_im,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor grad_offset);
void modulated_deformable_im2col_cuda(
const at::Tensor data_im,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor data_col);
void modulated_deformable_col2im_cuda(
const at::Tensor data_col,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor grad_im);
void modulated_deformable_col2im_coord_cuda(
const at::Tensor data_col,
const at::Tensor data_im,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor grad_offset,
at::Tensor grad_mask);
void shape_check(
at::Tensor input,
at::Tensor offset,
at::Tensor* gradOutput,
at::Tensor weight,
int kH,
int kW,
int dH,
int dW,
int padH,
int padW,
int dilationH,
int dilationW,
int group,
int deformable_group) {
TORCH_CHECK(
weight.ndimension() == 4,
"4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
"but got: %s",
weight.ndimension());
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(
kW > 0 && kH > 0,
"kernel size should be greater than zero, but got kH: %d kW: %d",
kH,
kW);
TORCH_CHECK(
(weight.size(2) == kH && weight.size(3) == kW),
"kernel size should be consistent with weight, ",
"but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d",
kH,
kW,
weight.size(2),
weight.size(3));
TORCH_CHECK(
dW > 0 && dH > 0,
"stride should be greater than zero, but got dH: %d dW: %d",
dH,
dW);
TORCH_CHECK(
dilationW > 0 && dilationH > 0,
"dilation should be greater than 0, but got dilationH: %d dilationW: %d",
dilationH,
dilationW);
int ndim = input.ndimension();
int dimf = 0;
int dimh = 1;
int dimw = 2;
if (ndim == 4) {
dimf++;
dimh++;
dimw++;
}
TORCH_CHECK(
ndim == 3 || ndim == 4,
"3D or 4D input tensor expected but got: %s",
ndim);
long nInputPlane = weight.size(1) * group;
long inputHeight = input.size(dimh);
long inputWidth = input.size(dimw);
long nOutputPlane = weight.size(0);
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
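// e.g., a 224 x 224 input with kH = kW = 3, padH = padW = 1,
// dilationH = dilationW = 1 and stride dH = dW = 1 gives
// (224 + 2 * 1 - (1 * (3 - 1) + 1)) / 1 + 1 = 224, i.e. a 224 x 224 output.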
TORCH_CHECK(
nInputPlane % deformable_group == 0,
"input channels must divide deformable group size");
if (outputWidth < 1 || outputHeight < 1)
AT_ERROR(
"Given input size: (%ld x %ld x %ld). "
"Calculated output size: (%ld x %ld x %ld). Output size is too small",
nInputPlane,
inputHeight,
inputWidth,
nOutputPlane,
outputHeight,
outputWidth);
TORCH_CHECK(
input.size(1) == nInputPlane,
"invalid number of input planes, expected: %d, but got: %d",
nInputPlane,
input.size(1));
TORCH_CHECK(
(inputHeight + 2 * padH >= kH && inputWidth + 2 * padW >= kW),
"input image is smaller than kernel");
TORCH_CHECK(
(offset.size(2) == outputHeight && offset.size(3) == outputWidth),
"invalid spatial size of offset, expected height: %d width: %d, but "
"got height: %d width: %d",
outputHeight,
outputWidth,
offset.size(2),
offset.size(3));
TORCH_CHECK(
(offset.size(1) == deformable_group * 2 * kH * kW),
"invalid number of channels of offset");
if (gradOutput != NULL) {
TORCH_CHECK(
gradOutput->size(dimf) == nOutputPlane,
"invalid number of gradOutput planes, expected: %d, but got: %d",
nOutputPlane,
gradOutput->size(dimf));
TORCH_CHECK(
(gradOutput->size(dimh) == outputHeight &&
gradOutput->size(dimw) == outputWidth),
"invalid size of gradOutput, expected height: %d width: %d , but "
"got height: %d width: %d",
outputHeight,
outputWidth,
gradOutput->size(dimh),
gradOutput->size(dimw));
}
}
int deform_conv_forward_cuda(
at::Tensor input,
at::Tensor weight,
at::Tensor offset,
at::Tensor output,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step) {
// todo: resize columns to include im2col: done
// todo: add im2col_step as input
// todo: add new output buffer and transpose it to output (or directly
// transpose output)
// todo: possibly change data indexing because of parallel_imgs
shape_check(
input,
offset,
NULL,
weight,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW,
group,
deformable_group);
input = input.contiguous();
offset = offset.contiguous();
weight = weight.contiguous();
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input.unsqueeze_(0);
offset.unsqueeze_(0);
}
// todo: assert batchsize dividable by im2col_step
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = weight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
output = output.view(
{batchSize / im2col_step,
im2col_step,
nOutputPlane,
outputHeight,
outputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
ones = at::ones({outputHeight, outputWidth}, input.options());
}
input = input.view(
{batchSize / im2col_step,
im2col_step,
nInputPlane,
inputHeight,
inputWidth});
offset = offset.view(
{batchSize / im2col_step,
im2col_step,
deformable_group * 2 * kH * kW,
outputHeight,
outputWidth});
at::Tensor output_buffer = at::zeros(
{batchSize / im2col_step,
nOutputPlane,
im2col_step * outputHeight,
outputWidth},
output.options());
output_buffer = output_buffer.view(
{output_buffer.size(0),
group,
output_buffer.size(1) / group,
output_buffer.size(2),
output_buffer.size(3)});
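// Each iteration below processes im2col_step images: deformable_im2col unfolds
// them into columns of shape (nInputPlane * kH * kW, im2col_step * outH * outW),
// and for each group g the product of weight[g].flatten(1), of shape
// (nOutputPlane / group, nInputPlane / group * kH * kW), with columns[g] is
// accumulated into output_buffer[elt][g].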
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(
input[elt],
offset[elt],
nInputPlane,
inputHeight,
inputWidth,
kH,
kW,
padH,
padW,
dH,
dW,
dilationH,
dilationW,
im2col_step,
deformable_group,
columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view(
{group,
weight.size(0) / group,
weight.size(1),
weight.size(2),
weight.size(3)});
for (int g = 0; g < group; g++) {
output_buffer[elt][g] = output_buffer[elt][g]
.flatten(1)
.addmm_(weight[g].flatten(1), columns[g])
.view_as(output_buffer[elt][g]);
}
}
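// Merge the group dimension back, split im2col_step out of the height axis,
// and swap it in front of the channel axis so the copy below restores
// (N, C, H, W) order.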
output_buffer = output_buffer.view(
{output_buffer.size(0),
output_buffer.size(1) * output_buffer.size(2),
output_buffer.size(3),
output_buffer.size(4)});
output_buffer = output_buffer.view(
{batchSize / im2col_step,
nOutputPlane,
im2col_step,
outputHeight,
outputWidth});
output_buffer.transpose_(1, 2);
output.copy_(output_buffer);
output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
output = output.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
}
return 1;
}
int deform_conv_backward_input_cuda(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradInput,
at::Tensor gradOffset,
at::Tensor weight,
at::Tensor columns,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step) {
shape_check(
input,
offset,
&gradOutput,
weight,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW,
group,
deformable_group);
input = input.contiguous();
offset = offset.contiguous();
gradOutput = gradOutput.contiguous();
weight = weight.contiguous();
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input = input.view({1, input.size(0), input.size(1), input.size(2)});
offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
gradOutput = gradOutput.view(
{1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
}
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = weight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
// change order of grad output
gradOutput = gradOutput.view(
{batchSize / im2col_step,
im2col_step,
nOutputPlane,
outputHeight,
outputWidth});
gradOutput.transpose_(1, 2);
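// gradOutput is now [chunks, nOutputPlane, im2col_step, outH, outW], so
// flattening the last three dims lines up with the `columns` layout below.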
gradInput = gradInput.view(
{batchSize / im2col_step,
im2col_step,
nInputPlane,
inputHeight,
inputWidth});
input = input.view(
{batchSize / im2col_step,
im2col_step,
nInputPlane,
inputHeight,
inputWidth});
gradOffset = gradOffset.view(
{batchSize / im2col_step,
im2col_step,
deformable_group * 2 * kH * kW,
outputHeight,
outputWidth});
offset = offset.view(
{batchSize / im2col_step,
im2col_step,
deformable_group * 2 * kH * kW,
outputHeight,
outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
// divide into groups
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view(
{group,
weight.size(0) / group,
weight.size(1),
weight.size(2),
weight.size(3)});
gradOutput = gradOutput.view(
{gradOutput.size(0),
group,
gradOutput.size(1) / group,
gradOutput.size(2),
gradOutput.size(3),
gradOutput.size(4)});
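// columns[g] = weight[g]^T x gradOutput[elt][g]: gradient of the loss w.r.t.
// the im2col columns (beta = 0 discards the previous contents of `columns`).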
for (int g = 0; g < group; g++) {
columns[g] = columns[g].addmm_(
weight[g].flatten(1).transpose(0, 1),
gradOutput[elt][g].flatten(1),
0.0f,
1.0f);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradOutput = gradOutput.view(
{gradOutput.size(0),
gradOutput.size(1) * gradOutput.size(2),
gradOutput.size(3),
gradOutput.size(4),
gradOutput.size(5)});
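// Two scatter passes: first accumulate the gradient w.r.t. the sampling
// offsets, then scatter the column gradient back onto the input image.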
deformable_col2im_coord(
columns,
input[elt],
offset[elt],
nInputPlane,
inputHeight,
inputWidth,
kH,
kW,
padH,
padW,
dH,
dW,
dilationH,
dilationW,
im2col_step,
deformable_group,
gradOffset[elt]);
deformable_col2im(
columns,
offset[elt],
nInputPlane,
inputHeight,
inputWidth,
kH,
kW,
padH,
padW,
dH,
dW,
dilationH,
dilationW,
im2col_step,
deformable_group,
gradInput[elt]);
}
gradOutput.transpose_(1, 2);
gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
gradOffset = gradOffset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
gradOffset =
gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
}
return 1;
}
int deform_conv_backward_parameters_cuda(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradWeight, // at::Tensor gradBias,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
float scale,
int im2col_step) {
// todo: transpose and reshape outGrad
// todo: reshape columns
// todo: add im2col_step as input
shape_check(
input,
offset,
&gradOutput,
gradWeight,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW,
group,
deformable_group);
input = input.contiguous();
offset = offset.contiguous();
gradOutput = gradOutput.contiguous();
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input = input.view(
at::IntList({1, input.size(0), input.size(1), input.size(2)}));
gradOutput = gradOutput.view(
{1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
}
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = gradWeight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
gradOutput = gradOutput.view(
{batchSize / im2col_step,
im2col_step,
nOutputPlane,
outputHeight,
outputWidth});
gradOutput.transpose_(1, 2);
at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
gradOutputBuffer = gradOutputBuffer.view(
{batchSize / im2col_step,
nOutputPlane,
im2col_step,
outputHeight,
outputWidth});
gradOutputBuffer.copy_(gradOutput);
// gradOutput is not contiguous, so we do reshape (instead of view) next
gradOutputBuffer = gradOutputBuffer.reshape(
{batchSize / im2col_step,
nOutputPlane,
im2col_step * outputHeight,
outputWidth});
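// gradOutputBuffer now has shape [chunks, nOutputPlane, im2col_step * outH, outW],
// so flattening it per group lines up with the `columns` layout in the GEMM below.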
gradOutput.transpose_(1, 2);
gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view(
{batchSize / im2col_step,
im2col_step,
nInputPlane,
inputHeight,
inputWidth});
offset = offset.view(
{batchSize / im2col_step,
im2col_step,
deformable_group * 2 * kH * kW,
outputHeight,
outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(
input[elt],
offset[elt],
nInputPlane,
inputHeight,
inputWidth,
kH,
kW,
padH,
padW,
dH,
dW,
dilationH,
dilationW,
im2col_step,
deformable_group,
columns);
// divide into groups
gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.size(0),
group,
gradOutputBuffer.size(1) / group,
gradOutputBuffer.size(2),
gradOutputBuffer.size(3)});
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
gradWeight = gradWeight.view(
{group,
gradWeight.size(0) / group,
gradWeight.size(1),
gradWeight.size(2),
gradWeight.size(3)});
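// gradWeight[g] += scale * gradOutputBuffer[elt][g] (flattened) x columns[g]^T,
// accumulated across chunks (beta = 1 keeps the previous contents).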
for (int g = 0; g < group; g++) {
gradWeight[g] = gradWeight[g]
.flatten(1)
.addmm_(
gradOutputBuffer[elt][g].flatten(1),
columns[g].transpose(1, 0),
1.0,
scale)
.view_as(gradWeight[g]);
}
gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.size(0),
gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
gradOutputBuffer.size(3),
gradOutputBuffer.size(4)});
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradWeight = gradWeight.view(
{gradWeight.size(0) * gradWeight.size(1),
gradWeight.size(2),
gradWeight.size(3),
gradWeight.size(4)});
}
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
}
return 1;
}
void modulated_deform_conv_cuda_forward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor output,
at::Tensor columns,
int kernel_h,
int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const int group,
const int deformable_group,
const bool with_bias) {
shape_check(
input,
offset,
NULL,
weight,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
group,
deformable_group);
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_out = weight.size(0);
const int channels_kernel = weight.size(1);
const int kernel_h_ = weight.size(2);
const int kernel_w_ = weight.size(3);
if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
AT_ERROR(
"Input shape and kernel shape won't match: (%d x %d vs %d x %d).",
kernel_h,
kernel_w,
kernel_h_,
kernel_w_);
if (channels != channels_kernel * group)
AT_ERROR(
"Input shape and kernel channels won't match: (%d vs %d).",
channels,
channels_kernel * group);
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
// mask shape check
TORCH_CHECK(
(mask.size(2) == height_out && mask.size(3) == width_out),
"invalid spatial size of mask, expected height: %d width: %d, but "
"got height: %d width: %d",
height_out,
width_out,
mask.size(2),
mask.size(3));
TORCH_CHECK(
(mask.size(1) == deformable_group * kernel_h * kernel_w),
"invalid number of channels of mask");
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = at::ones({height_out, width_out}, input.options());
}
// resize output
output = output.view({batch, channels_out, height_out, width_out}).zero_();
// resize temporary columns
columns = at::zeros(
{channels * kernel_h * kernel_w, 1 * height_out * width_out},
input.options());
output = output.view(
{output.size(0),
group,
output.size(1) / group,
output.size(2),
output.size(3)});
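// The modulated (DCNv2) path processes one image per iteration: unfold it with
// the offsets and modulation mask applied, then do a grouped matrix multiply.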
for (int b = 0; b < batch; b++) {
modulated_deformable_im2col_cuda(
input[b],
offset[b],
mask[b],
1,
channels,
height,
width,
height_out,
width_out,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
deformable_group,
columns);
// divide into groups
weight = weight.view(
{group,
weight.size(0) / group,
weight.size(1),
weight.size(2),
weight.size(3)});
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
for (int g = 0; g < group; g++) {
output[b][g] = output[b][g]
.flatten(1)
.addmm_(weight[g].flatten(1), columns[g])
.view_as(output[b][g]);
}
weight = weight.view(
{weight.size(0) * weight.size(1),
weight.size(2),
weight.size(3),
weight.size(4)});
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
}
output = output.view(
{output.size(0),
output.size(1) * output.size(2),
output.size(3),
output.size(4)});
if (with_bias) {
output += bias.view({1, bias.size(0), 1, 1});
}
}
void modulated_deform_conv_cuda_backward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor columns,
at::Tensor grad_input,
at::Tensor grad_weight,
at::Tensor grad_bias,
at::Tensor grad_offset,
at::Tensor grad_mask,
at::Tensor grad_output,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int pad_h,
int pad_w,
int dilation_h,
int dilation_w,
int group,
int deformable_group,
const bool with_bias) {
shape_check(
input,
offset,
&grad_output,
weight,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
group,
deformable_group);
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_kernel = weight.size(1);
const int kernel_h_ = weight.size(2);
const int kernel_w_ = weight.size(3);
if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
AT_ERROR(
"Input shape and kernel shape won't match: (%d x %d vs %d x %d).",
kernel_h,
kernel_w,
kernel_h_,
kernel_w_);
if (channels != channels_kernel * group)
AT_ERROR(
"Input shape and kernel channels won't match: (%d vs %d).",
channels,
channels_kernel * group);
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
// mask shape check
TORCH_CHECK(
(mask.size(2) == height_out && mask.size(3) == width_out),
"invalid spatial size of mask, expected height: %d width: %d, but "
"got height: %d width: %d",
height_out,
width_out,
mask.size(2),
mask.size(3));
TORCH_CHECK(
(mask.size(1) == deformable_group * kernel_h * kernel_w),
"invalid number of channels of mask");
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = at::ones({height_out, width_out}, input.options());
}
grad_input = grad_input.view({batch, channels, height, width});
columns = at::zeros(
{channels * kernel_h * kernel_w, height_out * width_out},
input.options());
grad_output = grad_output.view(
{grad_output.size(0),
group,
grad_output.size(1) / group,
grad_output.size(2),
grad_output.size(3)});
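// Per image: (1) columns = weight^T x grad_output gives the column gradient,
// (2) col2im_coord / col2im scatter it to grad_offset, grad_mask and grad_input,
// (3) a fresh im2col provides the columns for the weight (and bias) gradient.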
for (int b = 0; b < batch; b++) {
// divide into groups
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view(
{group,
weight.size(0) / group,
weight.size(1),
weight.size(2),
weight.size(3)});
for (int g = 0; g < group; g++) {
columns[g].addmm_(
weight[g].flatten(1).transpose(0, 1),
grad_output[b][g].flatten(1),
0.0f,
1.0f);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
weight = weight.view(
{weight.size(0) * weight.size(1),
weight.size(2),
weight.size(3),
weight.size(4)});
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda(
columns,
input[b],
offset[b],
mask[b],
1,
channels,
height,
width,
height_out,
width_out,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
deformable_group,
grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cuda(
columns,
offset[b],
mask[b],
1,
channels,
height,
width,
height_out,
width_out,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
deformable_group,
grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cuda(
input[b],
offset[b],
mask[b],
1,
channels,
height,
width,
height_out,
width_out,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
deformable_group,
columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
grad_weight = grad_weight.view(
{group,
grad_weight.size(0) / group,
grad_weight.size(1),
grad_weight.size(2),
grad_weight.size(3)});
if (with_bias)
grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
for (int g = 0; g < group; g++) {
grad_weight[g] =
grad_weight[g]
.flatten(1)
.addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
.view_as(grad_weight[g]);
if (with_bias) {
grad_bias[g] =
grad_bias[g]
.view({-1, 1})
.addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
.view(-1);
}
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
grad_weight = grad_weight.view(
{grad_weight.size(0) * grad_weight.size(1),
grad_weight.size(2),
grad_weight.size(3),
grad_weight.size(4)});
if (with_bias)
grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
}
grad_output = grad_output.view(
{grad_output.size(0) * grad_output.size(1),
grad_output.size(2),
grad_output.size(3),
grad_output.size(4)});
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
// modified from
// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
// Original license: Apache 2.0
// clang-format off
// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer *****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
*FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer *********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
#include <ATen/ATen.h>
#include <c10/cuda/CUDAGuard.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <THC/THCAtomics.cuh>
using namespace at;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
namespace {
const int CUDA_NUM_THREADS = 1024;
const int kMaxGridNum = 65535;
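// Cap the grid size at kMaxGridNum; CUDA_KERNEL_LOOP is a grid-stride loop,
// so all N elements are still covered even when the grid is clamped.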
inline int GET_BLOCKS(const int N) {
return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
}
}
template <typename scalar_t>
__device__ scalar_t deformable_im2col_bilinear(
const scalar_t* bottom_data,
const int data_width,
const int height,
const int width,
scalar_t h,
scalar_t w) {
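// Bilinear interpolation at the fractional location (h, w); corners that fall
// outside the image contribute zero.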
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
scalar_t lh = h - h_low;
scalar_t lw = w - w_low;
scalar_t hh = 1 - lh, hw = 1 - lw;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
v1 = bottom_data[h_low * data_width + w_low];
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = bottom_data[h_low * data_width + w_high];
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = bottom_data[h_high * data_width + w_low];
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = bottom_data[h_high * data_width + w_high];
scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename scalar_t>
__device__ scalar_t get_gradient_weight(
scalar_t argmax_h,
scalar_t argmax_w,
const int h,
const int w,
const int height,
const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename scalar_t>
__device__ scalar_t get_coordinate_weight(
scalar_t argmax_h,
scalar_t argmax_w,
const int height,
const int width,
const scalar_t* im_data,
const int data_width,
const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template <typename scalar_t>
__global__ void deformable_im2col_gpu_kernel(
const int n,
const scalar_t* data_im,
const scalar_t* data_offset,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int num_channels,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* data_col) {
CUDA_KERNEL_LOOP(index, n) {
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
scalar_t* data_col_ptr = data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
// const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
// height + h_in) * width + w_in;
const scalar_t* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const scalar_t* data_offset_ptr = data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
scalar_t val = static_cast<scalar_t>(0);
const scalar_t h_im = h_in + i * dilation_h + offset_h;
const scalar_t w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
// const scalar_t map_h = i * dilation_h + offset_h;
// const scalar_t map_w = j * dilation_w + offset_w;
// const int cur_height = height - h_in;
// const int cur_width = width - w_in;
// val = deformable_im2col_bilinear(data_im_ptr, width, cur_height,
// cur_width, map_h, map_w);
val = deformable_im2col_bilinear(
data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename scalar_t>
__global__ void deformable_col2im_gpu_kernel(
const int n,
const scalar_t* data_col,
const scalar_t* data_offset,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* grad_im) {
CUDA_KERNEL_LOOP(index, n) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const scalar_t* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
const scalar_t cur_top_grad = data_col[index];
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
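// Scatter this column gradient onto the integer pixels within unit distance of
// the fractional sampling location (its bilinear footprint). atomicAdd is used
// because several columns may write to the same input pixel.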
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
scalar_t weight = get_gradient_weight(
cur_inv_h_data,
cur_inv_w_data,
cur_h + dy,
cur_w + dx,
height,
width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
template <typename scalar_t>
__global__ void deformable_col2im_coord_gpu_kernel(
const int n,
const scalar_t* data_col,
const scalar_t* data_im,
const scalar_t* data_offset,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int offset_channels,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* grad_offset) {
CUDA_KERNEL_LOOP(index, n) {
scalar_t val = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const scalar_t* data_col_ptr = data_col +
deformable_group_index * channel_per_deformable_group * batch_size *
width_col * height_col;
const scalar_t* data_im_ptr = data_im +
(b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w * height * width;
const scalar_t* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
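// Each offset channel (an h- or w-offset of one kernel position in this
// deformable group) is used by that kernel position across all input channels
// of the group, so accumulate d(column)/d(offset) over those columns.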
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
scalar_t inv_h = h_in + i * dilation_h + offset_h;
scalar_t inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2;
}
const scalar_t weight = get_coordinate_weight(
inv_h,
inv_w,
height,
width,
data_im_ptr + cnt * height * width,
width,
bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[index] = val;
}
}
namespace detectron2 {
void deformable_im2col(
const at::Tensor data_im,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor data_col) {
// num_axes should be smaller than block size
// todo: check parallel_imgs is correctly passed in
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
at::cuda::CUDAGuard device_guard(data_im.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
deformable_im2col_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_im_,
data_offset_,
height,
width,
ksize_h,
ksize_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
parallel_imgs,
channels,
deformable_group,
height_col,
width_col,
data_col_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in deformable_im2col: %s\n", cudaGetErrorString(err));
}
}
void deformable_col2im(
const at::Tensor data_col,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor grad_im) {
// todo: make sure parallel_imgs is passed in correctly
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels =
channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
at::cuda::CUDAGuard device_guard(data_col.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t* grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_col_,
data_offset_,
channels,
height,
width,
ksize_h,
ksize_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
parallel_imgs,
deformable_group,
height_col,
width_col,
grad_im_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in deformable_col2im: %s\n", cudaGetErrorString(err));
}
}
void deformable_col2im_coord(
const at::Tensor data_col,
const at::Tensor data_im,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor grad_offset) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
deformable_group * parallel_imgs;
int channel_per_deformable_group =
channels * ksize_h * ksize_w / deformable_group;
at::cuda::CUDAGuard device_guard(data_col.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t* grad_offset_ = grad_offset.data_ptr<scalar_t>();
deformable_col2im_coord_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_col_,
data_im_,
data_offset_,
channels,
height,
width,
ksize_h,
ksize_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
parallel_imgs,
2 * ksize_h * ksize_w * deformable_group,
deformable_group,
height_col,
width_col,
grad_offset_);
}));
}
} // namespace detectron2
template <typename scalar_t>
__device__ scalar_t dmcn_im2col_bilinear(
const scalar_t* bottom_data,
const int data_width,
const int height,
const int width,
scalar_t h,
scalar_t w) {
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
scalar_t lh = h - h_low;
scalar_t lw = w - w_low;
scalar_t hh = 1 - lh, hw = 1 - lw;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
v1 = bottom_data[h_low * data_width + w_low];
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = bottom_data[h_low * data_width + w_high];
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = bottom_data[h_high * data_width + w_low];
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = bottom_data[h_high * data_width + w_high];
scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename scalar_t>
__device__ scalar_t dmcn_get_gradient_weight(
scalar_t argmax_h,
scalar_t argmax_w,
const int h,
const int w,
const int height,
const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename scalar_t>
__device__ scalar_t dmcn_get_coordinate_weight(
scalar_t argmax_h,
scalar_t argmax_w,
const int height,
const int width,
const scalar_t* im_data,
const int data_width,
const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template <typename scalar_t>
__global__ void modulated_deformable_im2col_gpu_kernel(
const int n,
const scalar_t* data_im,
const scalar_t* data_offset,
const scalar_t* data_mask,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int num_channels,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* data_col) {
CUDA_KERNEL_LOOP(index, n) {
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
scalar_t* data_col_ptr = data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
// const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
// height + h_in) * width + w_in;
const scalar_t* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const scalar_t* data_offset_ptr = data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const scalar_t* data_mask_ptr = data_mask +
(b_col * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
scalar_t val = static_cast<scalar_t>(0);
const scalar_t h_im = h_in + i * dilation_h + offset_h;
const scalar_t w_im = w_in + j * dilation_w + offset_w;
// if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
// const float map_h = i * dilation_h + offset_h;
// const float map_w = j * dilation_w + offset_w;
// const int cur_height = height - h_in;
// const int cur_width = width - w_in;
// val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height,
// cur_width, map_h, map_w);
val = dmcn_im2col_bilinear(
data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
// data_col_ptr += height_col * width_col;
}
}
}
}
template <typename scalar_t>
__global__ void modulated_deformable_col2im_gpu_kernel(
const int n,
const scalar_t* data_col,
const scalar_t* data_offset,
const scalar_t* data_mask,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* grad_im) {
CUDA_KERNEL_LOOP(index, n) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const scalar_t* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const scalar_t* data_mask_ptr = data_mask +
(b * deformable_group + deformable_group_index) * kernel_h * kernel_w *
height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
const scalar_t cur_top_grad = data_col[index] * mask;
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
scalar_t weight = dmcn_get_gradient_weight(
cur_inv_h_data,
cur_inv_w_data,
cur_h + dy,
cur_w + dx,
height,
width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
template <typename scalar_t>
__global__ void modulated_deformable_col2im_coord_gpu_kernel(
const int n,
const scalar_t* data_col,
const scalar_t* data_im,
const scalar_t* data_offset,
const scalar_t* data_mask,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int offset_channels,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* grad_offset,
scalar_t* grad_mask) {
CUDA_KERNEL_LOOP(index, n) {
scalar_t val = 0, mval = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const scalar_t* data_col_ptr = data_col +
deformable_group_index * channel_per_deformable_group * batch_size *
width_col * height_col;
const scalar_t* data_im_ptr = data_im +
(b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w * height * width;
const scalar_t* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const scalar_t* data_mask_ptr = data_mask +
(b * deformable_group + deformable_group_index) * kernel_h * kernel_w *
height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
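// val accumulates the gradient w.r.t. this offset channel; mval accumulates
// the gradient w.r.t. the corresponding modulation mask entry (the forward
// pass multiplies the bilinearly sampled value by the mask).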
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
scalar_t inv_h = h_in + i * dilation_h + offset_h;
scalar_t inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2;
} else {
mval += data_col_ptr[col_pos] *
dmcn_im2col_bilinear(
data_im_ptr + cnt * height * width,
width,
height,
width,
inv_h,
inv_w);
}
const scalar_t weight = dmcn_get_coordinate_weight(
inv_h,
inv_w,
height,
width,
data_im_ptr + cnt * height * width,
width,
bp_dir);
val += weight * data_col_ptr[col_pos] * mask;
cnt += 1;
}
// KERNEL_ASSIGN(grad_offset[index], offset_req, val);
grad_offset[index] = val;
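// The mask gradient is written once per (h, w) offset pair, by the thread that
// handles the even (h-offset) channel.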
if (offset_c % 2 == 0)
// KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
// deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
// height_col + h) * width_col + w], mask_req, mval);
grad_mask
[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w +
offset_c / 2) *
height_col +
h) *
width_col +
w] = mval;
}
}
namespace detectron2 {
void modulated_deformable_im2col_cuda(
const at::Tensor data_im,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor data_col) {
// num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
at::cuda::CUDAGuard device_guard(data_im.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
modulated_deformable_im2col_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_im_,
data_offset_,
data_mask_,
height_im,
width_im,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
batch_size,
channels,
deformable_group,
height_col,
width_col,
data_col_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf(
"error in modulated_deformable_im2col_cuda: %s\n",
cudaGetErrorString(err));
}
}
void modulated_deformable_col2im_cuda(
const at::Tensor data_col,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor grad_im) {
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels =
channels * kernel_h * kernel_w * batch_size * height_col * width_col;
at::cuda::CUDAGuard device_guard(data_col.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t* grad_im_ = grad_im.data_ptr<scalar_t>();
modulated_deformable_col2im_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_col_,
data_offset_,
data_mask_,
channels,
height_im,
width_im,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
batch_size,
deformable_group,
height_col,
width_col,
grad_im_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf(
"error in modulated_deformable_col2im_cuda: %s\n",
cudaGetErrorString(err));
}
}
void modulated_deformable_col2im_coord_cuda(
const at::Tensor data_col,
const at::Tensor data_im,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor grad_offset,
at::Tensor grad_mask) {
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
kernel_w * deformable_group;
const int channel_per_deformable_group =
channels * kernel_h * kernel_w / deformable_group;
at::cuda::CUDAGuard device_guard(data_col.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t* grad_offset_ = grad_offset.data_ptr<scalar_t>();
scalar_t* grad_mask_ = grad_mask.data_ptr<scalar_t>();
modulated_deformable_col2im_coord_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_col_,
data_im_,
data_offset_,
data_mask_,
channels,
height_im,
width_im,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
batch_size,
2 * kernel_h * kernel_w * deformable_group,
deformable_group,
height_col,
width_col,
grad_offset_,
grad_mask_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf(
"error in modulated_deformable_col2im_coord_cuda: %s\n",
cudaGetErrorString(err));
}
}
} // namespace detectron2