# Copyright (c) Facebook, Inc. and its affiliates.
import os
import torch
from detectron2.utils.file_io import PathManager
from .torchscript_patch import freeze_training_mode, patch_instances
__all__ = ["scripting_with_instances", "dump_torchscript_IR"]
def scripting_with_instances(model, fields):
"""
Run :func:`torch.jit.script` on a model that uses the :class:`Instances` class. Since
attributes of :class:`Instances` are "dynamically" added in eager mode, it is difficult
for scripting to support it out of the box. This function is made to support scripting
a model that uses :class:`Instances`. It does the following:
1. Create a scriptable ``new_Instances`` class which behaves similarly to ``Instances``,
but with all attributes being "static".
The attributes need to be statically declared in the ``fields`` argument.
2. Register ``new_Instances``, and force scripting compiler to
use it when trying to compile ``Instances``.
After this function returns, the patching is reverted. The user should be able to script another model
using different fields.
Example:
Assume that ``Instances`` in the model consist of two attributes named
``proposal_boxes`` and ``objectness_logits`` with type :class:`Boxes` and
:class:`Tensor` respectively during inference. You can call this function like:
::
fields = {"proposal_boxes": Boxes, "objectness_logits": torch.Tensor}
torchscript_model = scripting_with_instances(model, fields)
Note:
It only supports models in evaluation mode.
Args:
model (nn.Module): The input model to be exported by scripting.
fields (Dict[str, type]): Attribute names and corresponding type that
``Instances`` will use in the model. Note that all attributes used in ``Instances``
need to be added, regardless of whether they are inputs/outputs of the model.
Data types not defined in detectron2 are not supported for now.
Returns:
torch.jit.ScriptModule: the model in torchscript format
"""
assert (
not model.training
), "Currently we only support exporting models in evaluation mode to torchscript"
with freeze_training_mode(model), patch_instances(fields):
scripted_model = torch.jit.script(model)
return scripted_model
# alias for old name
export_torchscript_with_instances = scripting_with_instances
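# Illustrative usage sketch (editor's addition, not part of the original file). It assumes a
# detectron2-style model whose Instances expose `proposal_boxes` (Boxes) and
# `objectness_logits` (Tensor) at inference time; adapt `fields` to your own model.
def _example_export_with_instances(model):
    from detectron2.structures import Boxes  # local import keeps the sketch self-contained

    model.eval()  # scripting_with_instances only supports evaluation mode
    fields = {"proposal_boxes": Boxes, "objectness_logits": torch.Tensor}
    ts_model = scripting_with_instances(model, fields)
    ts_model.save("model.ts")  # standard torch.jit.ScriptModule serialization
    return ts_model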
def dump_torchscript_IR(model, dir):
"""
Dump IR of a TracedModule/ScriptModule/ScriptFunction in various formats (code, graph,
inlined graph). Useful for debugging.
Args:
model (TracedModule/ScriptModule/ScriptFunction): traced or scripted module
dir (str): output directory to dump files.
"""
dir = os.path.expanduser(dir)
PathManager.mkdirs(dir)
def _get_script_mod(mod):
if isinstance(mod, torch.jit.TracedModule):
return mod._actual_script_module
return mod
# Dump pretty-printed code: https://pytorch.org/docs/stable/jit.html#inspecting-code
with PathManager.open(os.path.join(dir, "model_ts_code.txt"), "w") as f:
def get_code(mod):
# Try a few ways to get code using private attributes.
try:
# This contains more information than just `mod.code`
return _get_script_mod(mod)._c.code
except AttributeError:
pass
try:
return mod.code
except AttributeError:
return None
def dump_code(prefix, mod):
code = get_code(mod)
name = prefix or "root model"
if code is None:
f.write(f"Could not found code for {name} (type={mod.original_name})\n")
f.write("\n")
else:
f.write(f"\nCode for {name}, type={mod.original_name}:\n")
f.write(code)
f.write("\n")
f.write("-" * 80)
for name, m in mod.named_children():
dump_code(prefix + "." + name, m)
if isinstance(model, torch.jit.ScriptFunction):
f.write(get_code(model))
else:
dump_code("", model)
def _get_graph(model):
try:
# Recursively dump IR of all modules
return _get_script_mod(model)._c.dump_to_str(True, False, False)
except AttributeError:
return model.graph.str()
with PathManager.open(os.path.join(dir, "model_ts_IR.txt"), "w") as f:
f.write(_get_graph(model))
# Dump IR of the entire graph (all submodules inlined)
with PathManager.open(os.path.join(dir, "model_ts_IR_inlined.txt"), "w") as f:
f.write(str(model.inlined_graph))
if not isinstance(model, torch.jit.ScriptFunction):
# Dump the model structure in pytorch style
with PathManager.open(os.path.join(dir, "model.txt"), "w") as f:
f.write(str(model))
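# Illustrative usage sketch (editor's addition): script a model and dump its IR for
# inspection. `model` and the output directory are placeholders.
def _example_dump_ir(model):
    model.eval()
    ts_model = torch.jit.script(model)  # or torch.jit.trace(model, example_inputs)
    # writes model_ts_code.txt, model_ts_IR.txt, model_ts_IR_inlined.txt and,
    # for modules, model.txt into the directory
    dump_torchscript_IR(ts_model, "./ts_dump")
    return ts_model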
# Copyright (c) Facebook, Inc. and its affiliates.
import os
import sys
import tempfile
from contextlib import ExitStack, contextmanager
from copy import deepcopy
from unittest import mock
import torch
from torch import nn
# need some explicit imports due to https://github.com/pytorch/pytorch/issues/38964
import detectron2 # noqa F401
from detectron2.structures import Boxes, Instances
from detectron2.utils.env import _import_file
_counter = 0
def _clear_jit_cache():
from torch.jit._recursive import concrete_type_store
from torch.jit._state import _jit_caching_layer
concrete_type_store.type_store.clear() # for modules
_jit_caching_layer.clear() # for free functions
def _add_instances_conversion_methods(newInstances):
"""
Add from_instances methods to the scripted Instances class.
"""
cls_name = newInstances.__name__
@torch.jit.unused
def from_instances(instances: Instances):
"""
Create scripted Instances from original Instances
"""
fields = instances.get_fields()
image_size = instances.image_size
ret = newInstances(image_size)
for name, val in fields.items():
assert hasattr(ret, f"_{name}"), f"No attribute named {name} in {cls_name}"
setattr(ret, name, deepcopy(val))
return ret
newInstances.from_instances = from_instances
@contextmanager
def patch_instances(fields):
"""
A context manager under which the Instances class in detectron2 is replaced
by a statically-typed scriptable class, defined by `fields`.
See more in `scripting_with_instances`.
"""
with tempfile.TemporaryDirectory(prefix="detectron2") as dir, tempfile.NamedTemporaryFile(
mode="w", encoding="utf-8", suffix=".py", dir=dir, delete=False
) as f:
try:
# Objects that use Instances should not reuse previously-compiled
# results in cache, because `Instances` could be a new class each time.
_clear_jit_cache()
cls_name, s = _gen_instance_module(fields)
f.write(s)
f.flush()
f.close()
module = _import(f.name)
new_instances = getattr(module, cls_name)
_ = torch.jit.script(new_instances)
# let torchscript think Instances was scripted already
Instances.__torch_script_class__ = True
# let torchscript find new_instances when looking for the jit type of Instances
Instances._jit_override_qualname = torch._jit_internal._qualified_name(new_instances)
_add_instances_conversion_methods(new_instances)
yield new_instances
finally:
try:
del Instances.__torch_script_class__
del Instances._jit_override_qualname
except AttributeError:
pass
sys.modules.pop(module.__name__)
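# Illustrative usage sketch (editor's addition): `patch_instances` is normally used through
# `scripting_with_instances`, but it can also be applied directly. The field names below
# are assumptions for a box-predicting module, not part of this file.
def _example_patch_instances(module_using_instances):
    from detectron2.structures import Boxes

    fields = {"pred_boxes": Boxes, "scores": torch.Tensor}
    with patch_instances(fields) as new_instances:
        scripted = torch.jit.script(module_using_instances)
        # `new_instances.from_instances(inst)` converts an eager Instances object
        # into the statically-typed scripted class if needed.
    return scripted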
def _gen_instance_class(fields):
"""
Args:
fields (dict[name: type])
"""
class _FieldType:
def __init__(self, name, type_):
assert isinstance(name, str), f"Field name must be str, got {name}"
self.name = name
self.type_ = type_
self.annotation = f"{type_.__module__}.{type_.__name__}"
fields = [_FieldType(k, v) for k, v in fields.items()]
def indent(level, s):
return " " * 4 * level + s
lines = []
global _counter
_counter += 1
cls_name = "ScriptedInstances{}".format(_counter)
field_names = tuple(x.name for x in fields)
extra_args = ", ".join([f"{f.name}: Optional[{f.annotation}] = None" for f in fields])
lines.append(
f"""
class {cls_name}:
def __init__(self, image_size: Tuple[int, int], {extra_args}):
self.image_size = image_size
self._field_names = {field_names}
"""
)
for f in fields:
lines.append(
indent(2, f"self._{f.name} = torch.jit.annotate(Optional[{f.annotation}], {f.name})")
)
for f in fields:
lines.append(
f"""
@property
def {f.name}(self) -> {f.annotation}:
# has to use a local for type refinement
# https://pytorch.org/docs/stable/jit_language_reference.html#optional-type-refinement
t = self._{f.name}
assert t is not None, "{f.name} is None and cannot be accessed!"
return t
@{f.name}.setter
def {f.name}(self, value: {f.annotation}) -> None:
self._{f.name} = value
"""
)
# support method `__len__`
lines.append(
"""
def __len__(self) -> int:
"""
)
for f in fields:
lines.append(
f"""
t = self._{f.name}
if t is not None:
return len(t)
"""
)
lines.append(
"""
raise NotImplementedError("Empty Instances does not support __len__!")
"""
)
# support method `has`
lines.append(
"""
def has(self, name: str) -> bool:
"""
)
for f in fields:
lines.append(
f"""
if name == "{f.name}":
return self._{f.name} is not None
"""
)
lines.append(
"""
return False
"""
)
# support method `to`
none_args = ", None" * len(fields)
lines.append(
f"""
def to(self, device: torch.device) -> "{cls_name}":
ret = {cls_name}(self.image_size{none_args})
"""
)
for f in fields:
if hasattr(f.type_, "to"):
lines.append(
f"""
t = self._{f.name}
if t is not None:
ret._{f.name} = t.to(device)
"""
)
else:
# For now, ignore fields that cannot be moved to devices.
# Maybe can support other tensor-like classes (e.g. __torch_function__)
pass
lines.append(
"""
return ret
"""
)
# support method `getitem`
none_args = ", None" * len(fields)
lines.append(
f"""
def __getitem__(self, item) -> "{cls_name}":
ret = {cls_name}(self.image_size{none_args})
"""
)
for f in fields:
lines.append(
f"""
t = self._{f.name}
if t is not None:
ret._{f.name} = t[item]
"""
)
lines.append(
"""
return ret
"""
)
# support method `cat`
# this version does not contain checks that all instances have same size and fields
none_args = ", None" * len(fields)
lines.append(
f"""
def cat(self, instances: List["{cls_name}"]) -> "{cls_name}":
ret = {cls_name}(self.image_size{none_args})
"""
)
for f in fields:
lines.append(
f"""
t = self._{f.name}
if t is not None:
values: List[{f.annotation}] = [x.{f.name} for x in instances]
if torch.jit.isinstance(t, torch.Tensor):
ret._{f.name} = torch.cat(values, dim=0)
else:
ret._{f.name} = t.cat(values)
"""
)
lines.append(
"""
return ret"""
)
# support method `get_fields()`
lines.append(
"""
def get_fields(self) -> Dict[str, Tensor]:
ret = {}
"""
)
for f in fields:
if f.type_ == Boxes:
stmt = "t.tensor"
elif f.type_ == torch.Tensor:
stmt = "t"
else:
stmt = f'assert False, "unsupported type {str(f.type_)}"'
lines.append(
f"""
t = self._{f.name}
if t is not None:
ret["{f.name}"] = {stmt}
"""
)
lines.append(
"""
return ret"""
)
return cls_name, os.linesep.join(lines)
def _gen_instance_module(fields):
# TODO: find a more automatic way to enable import of other classes
s = """
from copy import deepcopy
import torch
from torch import Tensor
import typing
from typing import *
import detectron2
from detectron2.structures import Boxes, Instances
"""
cls_name, cls_def = _gen_instance_class(fields)
s += cls_def
return cls_name, s
def _import(path):
return _import_file(
"{}{}".format(sys.modules[__name__].__name__, _counter), path, make_importable=True
)
@contextmanager
def patch_builtin_len(modules=()):
"""
Patch the builtin len() function of a few detectron2 modules
to use __len__ instead, because __len__ does not convert values to
integers and therefore is friendly to tracing.
Args:
modules (list[str]): names of extra modules to patch len(), in
addition to those in detectron2.
"""
def _new_len(obj):
return obj.__len__()
with ExitStack() as stack:
MODULES = [
"detectron2.modeling.roi_heads.fast_rcnn",
"detectron2.modeling.roi_heads.mask_head",
"detectron2.modeling.roi_heads.keypoint_head",
] + list(modules)
ctxs = [stack.enter_context(mock.patch(mod + ".len")) for mod in MODULES]
for m in ctxs:
m.side_effect = _new_len
yield
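# Illustrative usage sketch (editor's addition): wrap tracing with `patch_builtin_len` so
# that `len(...)` calls inside the patched modules go through `__len__` and stay traceable.
# `model` and `example_inputs` are placeholders.
def _example_trace_with_patched_len(model, example_inputs):
    with patch_builtin_len():
        traced = torch.jit.trace(model, example_inputs)
    return traced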
def patch_nonscriptable_classes():
"""
Apply patches on a few nonscriptable detectron2 classes.
Should not have side-effects on eager usage.
"""
# __prepare_scriptable__ can also be added to models for easier maintenance.
# But it complicates the clean model code.
from detectron2.modeling.backbone import ResNet, FPN
# Due to https://github.com/pytorch/pytorch/issues/36061,
# we change backbone to use ModuleList for scripting.
# (note: this changes param names in state_dict)
def prepare_resnet(self):
ret = deepcopy(self)
ret.stages = nn.ModuleList(ret.stages)
for k in self.stage_names:
delattr(ret, k)
return ret
ResNet.__prepare_scriptable__ = prepare_resnet
def prepare_fpn(self):
ret = deepcopy(self)
ret.lateral_convs = nn.ModuleList(ret.lateral_convs)
ret.output_convs = nn.ModuleList(ret.output_convs)
for name, _ in self.named_children():
if name.startswith("fpn_"):
delattr(ret, name)
return ret
FPN.__prepare_scriptable__ = prepare_fpn
# Annotate some attributes to be constants for the purpose of scripting,
# even though they are not constants in eager mode.
from detectron2.modeling.roi_heads import StandardROIHeads
if hasattr(StandardROIHeads, "__annotations__"):
# copy first to avoid editing annotations of base class
StandardROIHeads.__annotations__ = deepcopy(StandardROIHeads.__annotations__)
StandardROIHeads.__annotations__["mask_on"] = torch.jit.Final[bool]
StandardROIHeads.__annotations__["keypoint_on"] = torch.jit.Final[bool]
# These patches are not supposed to have side-effects.
patch_nonscriptable_classes()
@contextmanager
def freeze_training_mode(model):
"""
A context manager that annotates the "training" attribute of every submodule
as constant, so that the training codepath in these modules can be
meta-compiled away. Upon exiting, the annotations are reverted.
"""
classes = {type(x) for x in model.modules()}
# __constants__ is the old way to annotate constants and is not compatible
# with __annotations__.
classes = {x for x in classes if not hasattr(x, "__constants__")}
for cls in classes:
cls.__annotations__["training"] = torch.jit.Final[bool]
yield
for cls in classes:
cls.__annotations__["training"] = bool
# Copyright (c) Facebook, Inc. and its affiliates.
from .batch_norm import FrozenBatchNorm2d, get_norm, NaiveSyncBatchNorm, CycleBatchNormList
from .deform_conv import DeformConv, ModulatedDeformConv
from .mask_ops import paste_masks_in_image
from .nms import batched_nms, batched_nms_rotated, nms, nms_rotated
from .roi_align import ROIAlign, roi_align
from .roi_align_rotated import ROIAlignRotated, roi_align_rotated
from .shape_spec import ShapeSpec
from .wrappers import (
BatchNorm2d,
Conv2d,
ConvTranspose2d,
cat,
interpolate,
Linear,
nonzero_tuple,
cross_entropy,
empty_input_loss_func_wrapper,
shapes_to_tensor,
move_device_like,
)
from .blocks import CNNBlockBase, DepthwiseSeparableConv2d
from .aspp import ASPP
from .losses import ciou_loss, diou_loss
__all__ = [k for k in globals().keys() if not k.startswith("_")]
# Copyright (c) Facebook, Inc. and its affiliates.
from copy import deepcopy
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from .batch_norm import get_norm
from .blocks import DepthwiseSeparableConv2d
from .wrappers import Conv2d
class ASPP(nn.Module):
"""
Atrous Spatial Pyramid Pooling (ASPP).
"""
def __init__(
self,
in_channels,
out_channels,
dilations,
*,
norm,
activation,
pool_kernel_size=None,
dropout: float = 0.0,
use_depthwise_separable_conv=False,
):
"""
Args:
in_channels (int): number of input channels for ASPP.
out_channels (int): number of output channels.
dilations (list): a list of 3 dilations in ASPP.
norm (str or callable): normalization for all conv layers.
See :func:`layers.get_norm` for supported format. norm is
applied to all conv layers except the conv following
global average pooling.
activation (callable): activation function.
pool_kernel_size (tuple, list): the average pooling size (kh, kw)
for image pooling layer in ASPP. If set to None, it always
performs global average pooling. If not None, it must be
divisible by the shape of inputs in forward(). It is recommended
to use a fixed input feature size in training, and set this
option to match this size, so that it performs global average
pooling in training, and the size of the pooling window stays
consistent in inference.
dropout (float): apply dropout on the output of ASPP. It is used in
the official DeepLab implementation with a rate of 0.1:
https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/model.py#L532 # noqa
use_depthwise_separable_conv (bool): use DepthwiseSeparableConv2d
for 3x3 convs in ASPP, proposed in :paper:`DeepLabV3+`.
"""
super(ASPP, self).__init__()
assert len(dilations) == 3, "ASPP expects 3 dilations, got {}".format(len(dilations))
self.pool_kernel_size = pool_kernel_size
self.dropout = dropout
use_bias = norm == ""
self.convs = nn.ModuleList()
# conv 1x1
self.convs.append(
Conv2d(
in_channels,
out_channels,
kernel_size=1,
bias=use_bias,
norm=get_norm(norm, out_channels),
activation=deepcopy(activation),
)
)
weight_init.c2_xavier_fill(self.convs[-1])
# atrous convs
for dilation in dilations:
if use_depthwise_separable_conv:
self.convs.append(
DepthwiseSeparableConv2d(
in_channels,
out_channels,
kernel_size=3,
padding=dilation,
dilation=dilation,
norm1=norm,
activation1=deepcopy(activation),
norm2=norm,
activation2=deepcopy(activation),
)
)
else:
self.convs.append(
Conv2d(
in_channels,
out_channels,
kernel_size=3,
padding=dilation,
dilation=dilation,
bias=use_bias,
norm=get_norm(norm, out_channels),
activation=deepcopy(activation),
)
)
weight_init.c2_xavier_fill(self.convs[-1])
# image pooling
# We do not add BatchNorm because the spatial resolution is 1x1,
# even though the original TF implementation has BatchNorm.
if pool_kernel_size is None:
image_pooling = nn.Sequential(
nn.AdaptiveAvgPool2d(1),
Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)),
)
else:
image_pooling = nn.Sequential(
nn.AvgPool2d(kernel_size=pool_kernel_size, stride=1),
Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)),
)
weight_init.c2_xavier_fill(image_pooling[1])
self.convs.append(image_pooling)
self.project = Conv2d(
5 * out_channels,
out_channels,
kernel_size=1,
bias=use_bias,
norm=get_norm(norm, out_channels),
activation=deepcopy(activation),
)
weight_init.c2_xavier_fill(self.project)
def forward(self, x):
size = x.shape[-2:]
if self.pool_kernel_size is not None:
if size[0] % self.pool_kernel_size[0] or size[1] % self.pool_kernel_size[1]:
raise ValueError(
"`pool_kernel_size` must be divisible by the shape of inputs. "
"Input size: {} `pool_kernel_size`: {}".format(size, self.pool_kernel_size)
)
res = []
for conv in self.convs:
res.append(conv(x))
res[-1] = F.interpolate(res[-1], size=size, mode="bilinear", align_corners=False)
res = torch.cat(res, dim=1)
res = self.project(res)
res = F.dropout(res, self.dropout, training=self.training) if self.dropout > 0 else res
return res
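# Illustrative usage sketch (editor's addition): a minimal ASPP head on a 32-channel
# feature map. The channel sizes and dilations are arbitrary example values.
def _example_aspp():
    aspp = ASPP(
        in_channels=32,
        out_channels=16,
        dilations=[6, 12, 18],
        norm="BN",
        activation=nn.ReLU(),
    )
    x = torch.randn(2, 32, 64, 64)
    out = aspp(x)  # shape: (2, 16, 64, 64)
    return out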
# Copyright (c) Facebook, Inc. and its affiliates.
import torch
import torch.distributed as dist
from fvcore.nn.distributed import differentiable_all_reduce
from torch import nn
from torch.nn import functional as F
from detectron2.utils import comm, env
from .wrappers import BatchNorm2d
class FrozenBatchNorm2d(nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed.
It contains non-trainable buffers called
"weight" and "bias", "running_mean", "running_var",
initialized to perform identity transformation.
The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
which are computed from the original four parameters of BN.
The affine transform `x * weight + bias` will perform the equivalent
computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
When loading a backbone model from Caffe2, "running_mean" and "running_var"
will be left unchanged as identity transformation.
Other pre-trained backbone models may contain all 4 parameters.
The forward is implemented by `F.batch_norm(..., training=False)`.
"""
_version = 3
def __init__(self, num_features, eps=1e-5):
super().__init__()
self.num_features = num_features
self.eps = eps
self.register_buffer("weight", torch.ones(num_features))
self.register_buffer("bias", torch.zeros(num_features))
self.register_buffer("running_mean", torch.zeros(num_features))
self.register_buffer("running_var", torch.ones(num_features) - eps)
self.register_buffer("num_batches_tracked", None)
def forward(self, x):
if x.requires_grad:
# When gradients are needed, F.batch_norm will use extra memory
# because its backward op computes gradients for weight/bias as well.
scale = self.weight * (self.running_var + self.eps).rsqrt()
bias = self.bias - self.running_mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
out_dtype = x.dtype # may be half
return x * scale.to(out_dtype) + bias.to(out_dtype)
else:
# When gradients are not needed, F.batch_norm is a single fused op
# and provides more optimization opportunities.
return F.batch_norm(
x,
self.running_mean,
self.running_var,
self.weight,
self.bias,
training=False,
eps=self.eps,
)
def _load_from_state_dict(
self,
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
version = local_metadata.get("version", None)
if version is None or version < 2:
# No running_mean/var in early versions
# This silences the warnings
if prefix + "running_mean" not in state_dict:
state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
if prefix + "running_var" not in state_dict:
state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)
super()._load_from_state_dict(
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
)
def __repr__(self):
return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)
@classmethod
def convert_frozen_batchnorm(cls, module):
"""
Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
Args:
module (torch.nn.Module):
Returns:
If module is BatchNorm/SyncBatchNorm, returns a new module.
Otherwise, convert the module in place and return it.
Similar to convert_sync_batchnorm in
https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
"""
bn_module = nn.modules.batchnorm
bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
res = module
if isinstance(module, bn_module):
res = cls(module.num_features)
if module.affine:
res.weight.data = module.weight.data.clone().detach()
res.bias.data = module.bias.data.clone().detach()
res.running_mean.data = module.running_mean.data
res.running_var.data = module.running_var.data
res.eps = module.eps
res.num_batches_tracked = module.num_batches_tracked
else:
for name, child in module.named_children():
new_child = cls.convert_frozen_batchnorm(child)
if new_child is not child:
res.add_module(name, new_child)
return res
@classmethod
def convert_frozenbatchnorm2d_to_batchnorm2d(cls, module: nn.Module) -> nn.Module:
"""
Convert all FrozenBatchNorm2d to BatchNorm2d
Args:
module (torch.nn.Module):
Returns:
If module is FrozenBatchNorm2d, returns a new module.
Otherwise, convert the module in place and return it.
This is needed for quantization:
https://fb.workplace.com/groups/1043663463248667/permalink/1296330057982005/
"""
res = module
if isinstance(module, FrozenBatchNorm2d):
res = torch.nn.BatchNorm2d(module.num_features, module.eps)
res.weight.data = module.weight.data.clone().detach()
res.bias.data = module.bias.data.clone().detach()
res.running_mean.data = module.running_mean.data.clone().detach()
res.running_var.data = module.running_var.data.clone().detach()
res.eps = module.eps
res.num_batches_tracked = module.num_batches_tracked
else:
for name, child in module.named_children():
new_child = cls.convert_frozenbatchnorm2d_to_batchnorm2d(child)
if new_child is not child:
res.add_module(name, new_child)
return res
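# Illustrative usage sketch (editor's addition): freeze every BatchNorm/SyncBatchNorm in a
# backbone, and convert back (e.g. before quantization). `backbone` is a placeholder module.
def _example_freeze_bn(backbone: nn.Module) -> nn.Module:
    backbone = FrozenBatchNorm2d.convert_frozen_batchnorm(backbone)
    # ... train / run inference; BN statistics and affine parameters stay fixed ...
    backbone = FrozenBatchNorm2d.convert_frozenbatchnorm2d_to_batchnorm2d(backbone)
    return backbone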
def get_norm(norm, out_channels):
"""
Args:
norm (str or callable): either one of BN, SyncBN, FrozenBN, GN;
or a callable that takes a channel number and returns
the normalization layer as a nn.Module.
Returns:
nn.Module or None: the normalization layer
"""
if norm is None:
return None
if isinstance(norm, str):
if len(norm) == 0:
return None
norm = {
"BN": BatchNorm2d,
# Fixed in https://github.com/pytorch/pytorch/pull/36382
"SyncBN": NaiveSyncBatchNorm if env.TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm,
"FrozenBN": FrozenBatchNorm2d,
"GN": lambda channels: nn.GroupNorm(32, channels),
# for debugging:
"nnSyncBN": nn.SyncBatchNorm,
"naiveSyncBN": NaiveSyncBatchNorm,
# expose stats_mode N as an option to caller, required for zero-len inputs
"naiveSyncBN_N": lambda channels: NaiveSyncBatchNorm(channels, stats_mode="N"),
"LN": lambda channels: LayerNorm(channels),
}[norm]
return norm(out_channels)
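# Illustrative usage sketch (editor's addition): build a conv + norm stack from a
# configurable norm string, the way the rest of detectron2's layers do.
def _example_get_norm(norm: str = "GN", channels: int = 64) -> nn.Module:
    norm_layer = get_norm(norm, channels)  # returns None for norm="" or norm=None
    modules = [nn.Conv2d(3, channels, kernel_size=3, padding=1, bias=norm_layer is None)]
    if norm_layer is not None:
        modules.append(norm_layer)
    return nn.Sequential(*modules)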
class NaiveSyncBatchNorm(BatchNorm2d):
"""
In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient
when the batch size on each worker is different.
(e.g., when scale augmentation is used, or when it is applied to mask head).
This is a slower but correct alternative to `nn.SyncBatchNorm`.
Note:
There isn't a single definition of Sync BatchNorm.
When ``stats_mode==""``, this module computes overall statistics by using
statistics of each worker with equal weight. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (N, H, W). This mode does not support inputs with zero batch size.
When ``stats_mode=="N"``, this module computes overall statistics by weighting
the statistics of each worker by their ``N``. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (H, W). It is slower than ``stats_mode==""``.
Even though the result of this module may not be the true statistics of all samples,
it may still be reasonable because it might be preferable to assign equal weights
to all workers, regardless of their (H, W) dimension, instead of putting larger weight
on larger images. From preliminary experiments, little difference is found between such
a simplified implementation and an accurate computation of overall mean & variance.
"""
def __init__(self, *args, stats_mode="", **kwargs):
super().__init__(*args, **kwargs)
assert stats_mode in ["", "N"]
self._stats_mode = stats_mode
def forward(self, input):
if comm.get_world_size() == 1 or not self.training:
return super().forward(input)
B, C = input.shape[0], input.shape[1]
half_input = input.dtype == torch.float16
if half_input:
# fp16 does not have good enough numerics for the reduction here
input = input.float()
mean = torch.mean(input, dim=[0, 2, 3])
meansqr = torch.mean(input * input, dim=[0, 2, 3])
if self._stats_mode == "":
assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
vec = torch.cat([mean, meansqr], dim=0)
vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size())
mean, meansqr = torch.split(vec, C)
momentum = self.momentum
else:
if B == 0:
vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
vec = vec + input.sum() # make sure there is gradient w.r.t input
else:
vec = torch.cat(
[
mean,
meansqr,
torch.ones([1], device=mean.device, dtype=mean.dtype),
],
dim=0,
)
vec = differentiable_all_reduce(vec * B)
total_batch = vec[-1].detach()
momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0
mean, meansqr, _ = torch.split(vec / total_batch.clamp(min=1), C) # avoid div-by-zero
var = meansqr - mean * mean
invstd = torch.rsqrt(var + self.eps)
scale = self.weight * invstd
bias = self.bias - mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
self.running_mean += momentum * (mean.detach() - self.running_mean)
self.running_var += momentum * (var.detach() - self.running_var)
ret = input * scale + bias
if half_input:
ret = ret.half()
return ret
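# Illustrative usage sketch (editor's addition): a drop-in replacement for BatchNorm2d that
# remains correct under distributed training with varying per-worker batch sizes;
# stats_mode="N" additionally tolerates zero-sized batches on some workers.
def _example_naive_sync_bn(channels: int = 64) -> nn.Module:
    return NaiveSyncBatchNorm(channels, stats_mode="N")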
class CycleBatchNormList(nn.ModuleList):
"""
Implement domain-specific BatchNorm by cycling.
When a BatchNorm layer is used for multiple input domains or input
features, it might need to maintain separate test-time statistics
for each domain. See Sec 5.2 in :paper:`rethinking-batchnorm`.
This module implements it by using N separate BN layers
and it cycles through them every time a forward() is called.
NOTE: The caller of this module MUST guarantee to always call
this module a multiple of N times. Otherwise its test-time statistics
will be incorrect.
"""
def __init__(self, length: int, bn_class=nn.BatchNorm2d, **kwargs):
"""
Args:
length: number of BatchNorm layers to cycle.
bn_class: the BatchNorm class to use
kwargs: arguments of the BatchNorm class, such as num_features.
"""
self._affine = kwargs.pop("affine", True)
super().__init__([bn_class(**kwargs, affine=False) for k in range(length)])
if self._affine:
# shared affine, domain-specific BN
channels = self[0].num_features
self.weight = nn.Parameter(torch.ones(channels))
self.bias = nn.Parameter(torch.zeros(channels))
self._pos = 0
def forward(self, x):
ret = self[self._pos](x)
self._pos = (self._pos + 1) % len(self)
if self._affine:
w = self.weight.reshape(1, -1, 1, 1)
b = self.bias.reshape(1, -1, 1, 1)
return ret * w + b
else:
return ret
def extra_repr(self):
return f"affine={self._affine}"
class LayerNorm(nn.Module):
"""
A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
variance normalization over the channel dimension for inputs that have shape
(batch_size, channels, height, width).
https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950
"""
def __init__(self, normalized_shape, eps=1e-6):
super().__init__()
self.weight = nn.Parameter(torch.ones(normalized_shape))
self.bias = nn.Parameter(torch.zeros(normalized_shape))
self.eps = eps
self.normalized_shape = (normalized_shape,)
def forward(self, x):
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
x = self.weight[:, None, None] * x + self.bias[:, None, None]
return x
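# Illustrative usage sketch (editor's addition): channels-first LayerNorm over an
# (N, C, H, W) feature map, normalizing across the C dimension at every spatial location.
def _example_channels_first_layernorm():
    ln = LayerNorm(normalized_shape=64)
    x = torch.randn(2, 64, 16, 16)
    return ln(x)  # same shape as x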
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
import fvcore.nn.weight_init as weight_init
from torch import nn
from .batch_norm import FrozenBatchNorm2d, get_norm
from .wrappers import Conv2d
"""
CNN building blocks.
"""
class CNNBlockBase(nn.Module):
"""
A CNN block is assumed to have input channels, output channels and a stride.
The input and output of the `forward()` method must be NCHW tensors.
The method can perform arbitrary computation but must match the given
channels and stride specification.
Attributes:
in_channels (int):
out_channels (int):
stride (int):
"""
def __init__(self, in_channels, out_channels, stride):
"""
The `__init__` method of any subclass should also contain these arguments.
Args:
in_channels (int):
out_channels (int):
stride (int):
"""
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.stride = stride
def freeze(self):
"""
Make this block not trainable.
This method sets all parameters to `requires_grad=False`,
and converts all BatchNorm layers to FrozenBatchNorm.
Returns:
the block itself
"""
for p in self.parameters():
p.requires_grad = False
FrozenBatchNorm2d.convert_frozen_batchnorm(self)
return self
class DepthwiseSeparableConv2d(nn.Module):
"""
A kxk depthwise convolution + a 1x1 convolution.
In :paper:`xception`, norm & activation are applied on the second conv.
:paper:`mobilenet` uses norm & activation on both convs.
"""
def __init__(
self,
in_channels,
out_channels,
kernel_size=3,
padding=1,
dilation=1,
*,
norm1=None,
activation1=None,
norm2=None,
activation2=None,
):
"""
Args:
norm1, norm2 (str or callable): normalization for the two conv layers.
activation1, activation2 (callable(Tensor) -> Tensor): activation
function for the two conv layers.
"""
super().__init__()
self.depthwise = Conv2d(
in_channels,
in_channels,
kernel_size=kernel_size,
padding=padding,
dilation=dilation,
groups=in_channels,
bias=not norm1,
norm=get_norm(norm1, in_channels),
activation=activation1,
)
self.pointwise = Conv2d(
in_channels,
out_channels,
kernel_size=1,
bias=not norm2,
norm=get_norm(norm2, out_channels),
activation=activation2,
)
# default initialization
weight_init.c2_msra_fill(self.depthwise)
weight_init.c2_msra_fill(self.pointwise)
def forward(self, x):
return self.pointwise(self.depthwise(x))
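# Illustrative usage sketch (editor's addition): a 3x3 depthwise + 1x1 pointwise block with
# norm/activation on both convs (MobileNet style). The values below are arbitrary examples.
def _example_depthwise_separable():
    import torch  # blocks.py itself only imports `nn`; needed here for the dummy input

    block = DepthwiseSeparableConv2d(
        in_channels=32,
        out_channels=64,
        kernel_size=3,
        padding=1,
        norm1="BN",
        activation1=nn.ReLU(),
        norm2="BN",
        activation2=nn.ReLU(),
    )
    x = torch.randn(1, 32, 28, 28)
    return block(x)  # shape: (1, 64, 28, 28)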
To add a new Op:
1. Create a new directory
2. Implement new ops there
3. Declare its Python interface in `vision.cpp`.
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <torch/types.h>
namespace detectron2 {
at::Tensor ROIAlignRotated_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio);
at::Tensor ROIAlignRotated_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio);
#if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor ROIAlignRotated_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio);
at::Tensor ROIAlignRotated_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio);
#endif
// Interface for Python
inline at::Tensor ROIAlignRotated_forward(
const at::Tensor& input,
const at::Tensor& rois,
const double spatial_scale,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t sampling_ratio) {
if (input.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
return ROIAlignRotated_forward_cuda(
input,
rois,
spatial_scale,
pooled_height,
pooled_width,
sampling_ratio);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
return ROIAlignRotated_forward_cpu(
input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
}
inline at::Tensor ROIAlignRotated_backward(
const at::Tensor& grad,
const at::Tensor& rois,
const double spatial_scale,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t batch_size,
const int64_t channels,
const int64_t height,
const int64_t width,
const int64_t sampling_ratio) {
if (grad.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
return ROIAlignRotated_backward_cuda(
grad,
rois,
spatial_scale,
pooled_height,
pooled_width,
batch_size,
channels,
height,
width,
sampling_ratio);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
return ROIAlignRotated_backward_cpu(
grad,
rois,
spatial_scale,
pooled_height,
pooled_width,
batch_size,
channels,
height,
width,
sampling_ratio);
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include <ATen/TensorUtils.h>
#include "ROIAlignRotated.h"
// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
// and PyTorch ROIAlign (non-rotated) Op implementations.
// The key difference between this implementation and those ones is
// we don't do "legacy offset" in this version, as there aren't many previous
// works, if any, using the "legacy" ROIAlignRotated Op.
// This would make the interface a bit cleaner.
namespace detectron2 {
namespace {
template <typename T>
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
T w1;
T w2;
T w3;
T w4;
};
template <typename T>
void pre_calc_for_bilinear_interpolate(
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int iy_upper,
const int ix_upper,
T roi_start_h,
T roi_start_w,
T bin_size_h,
T bin_size_w,
int roi_bin_grid_h,
int roi_bin_grid_w,
T roi_center_h,
T roi_center_w,
T cos_theta,
T sin_theta,
std::vector<PreCalc<T>>& pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
// In image space, (y, x) is the order for Right Handed System,
// and this is essentially multiplying the point by a rotation matrix
// to rotate it counterclockwise through angle theta.
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc<T> pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
template <typename T>
void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T& w1,
T& w2,
T& w3,
T& w4,
int& x_low,
int& x_high,
int& y_low,
int& y_high) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template <class T>
inline void add(T* address, const T& val) {
*address += val;
}
} // namespace
template <typename T>
void ROIAlignRotatedForward(
const int nthreads,
const T* input,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* rois,
T* output) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
AT_ASSERTM(
roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlignRotated do not have non-negative size!");
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc<T>> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
pre_calc_for_bilinear_interpolate(
height,
width,
pooled_height,
pooled_width,
roi_bin_grid_h,
roi_bin_grid_w,
roi_start_h,
roi_start_w,
bin_size_h,
bin_size_w,
roi_bin_grid_h,
roi_bin_grid_w,
roi_center_h,
roi_center_w,
cos_theta,
sin_theta,
pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
output[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
template <typename T>
void ROIAlignRotatedBackward(
const int nthreads,
// may not be contiguous. should index using n_stride, etc
const T* grad_output,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* grad_input,
const T* rois,
const int n_stride,
const int c_stride,
const int h_stride,
const int w_stride) {
for (int index = 0; index < nthreads; index++) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
AT_ASSERTM(
roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlignRotated do not have non-negative size!");
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_grad_input =
grad_input + ((roi_batch_ind * channels + c) * height * width);
int output_offset = n * n_stride + c * c_stride;
const T* offset_grad_output = grad_output + output_offset;
const T grad_output_this_bin =
offset_grad_output[ph * h_stride + pw * w_stride];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
T g1 = grad_output_this_bin * w1 / count;
T g2 = grad_output_this_bin * w2 / count;
T g3 = grad_output_this_bin * w3 / count;
T g4 = grad_output_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
} // if
} // ix
} // iy
} // for
} // ROIAlignRotatedBackward
at::Tensor ROIAlignRotated_forward_cpu(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_forward_cpu";
at::checkAllSameType(c, {input_t, rois_t});
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
at::Tensor output = at::zeros(
{num_rois, channels, pooled_height, pooled_width}, input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
if (output.numel() == 0) {
return output;
}
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIAlignRotated_forward", [&] {
ROIAlignRotatedForward<scalar_t>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
return output;
}
at::Tensor ROIAlignRotated_backward_cpu(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio) {
AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlignRotated_backward_cpu";
at::checkAllSameType(c, {grad_t, rois_t});
at::Tensor grad_input =
at::zeros({batch_size, channels, height, width}, grad.options());
// handle possibly empty gradients
if (grad.numel() == 0) {
return grad_input;
}
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad.stride(0);
int c_stride = grad.stride(1);
int h_stride = grad.stride(2);
int w_stride = grad.stride(3);
auto rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad.scalar_type(), "ROIAlignRotated_forward", [&] {
ROIAlignRotatedBackward<scalar_t>(
grad.numel(),
grad.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>(),
n_stride,
c_stride,
h_stride,
w_stride);
});
return grad_input;
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
// and PyTorch ROIAlign (non-rotated) Op implementations.
// The key difference between this implementation and those ones is
// we don't do "legacy offset" in this version, as there aren't many previous
// works, if any, using the "legacy" ROIAlignRotated Op.
// This would make the interface a bit cleaner.
namespace detectron2 {
namespace {
template <typename T>
__device__ T bilinear_interpolate(
const T* input,
const int height,
const int width,
T y,
T x) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
return 0;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = input[y_low * width + x_low];
T v2 = input[y_low * width + x_high];
T v3 = input[y_high * width + x_low];
T v4 = input[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__device__ void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T& w1,
T& w2,
T& w3,
T& w4,
int& x_low,
int& x_high,
int& y_low,
int& y_high) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
} // namespace
template <typename T>
__global__ void RoIAlignRotatedForward(
const int nthreads,
const T* input,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* rois,
T* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
// We do average (integral) pooling inside a bin
const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
T val = bilinear_interpolate(offset_input, height, width, y, x);
output_val += val;
}
}
output_val /= count;
top_data[index] = output_val;
}
}
template <typename T>
__global__ void RoIAlignRotatedBackwardFeature(
const int nthreads,
const T* top_diff,
const int num_rois,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* bottom_diff,
const T* rois) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T offset = (T)0.5;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5] * M_PI / 180.0;
T cos_theta = cos(theta);
T sin_theta = sin(theta);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
T g1 = top_diff_this_bin * w1 / count;
T g2 = top_diff_this_bin * w2 / count;
T g3 = top_diff_this_bin * w3 / count;
T g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(
offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
atomicAdd(
offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
atomicAdd(
offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
atomicAdd(
offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // RoIAlignRotatedBackward
at::Tensor ROIAlignRotated_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlignRotated_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t});
at::cuda::CUDAGuard device_guard(input.device());
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
auto output = at::empty(
{num_rois, channels, pooled_height, pooled_width}, input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
at::cuda::ATenCeilDiv(
static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
if (output.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return output;
}
auto input_ = input.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "ROIAlignRotated_forward", [&] {
RoIAlignRotatedForward<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois_.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return output;
}
// TODO remove the dependency on input and use instead its sizes -> save memory
at::Tensor ROIAlignRotated_backward_cuda(
const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio) {
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlignRotated_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t});
at::checkAllSameType(c, {grad_t, rois_t});
at::cuda::CUDAGuard device_guard(grad.device());
auto num_rois = rois.size(0);
auto grad_input =
at::zeros({batch_size, channels, height, width}, grad.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(
at::cuda::ATenCeilDiv(
static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
static_cast<int64_t>(4096)));
dim3 block(512);
// handle possibly empty gradients
if (grad.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
AT_DISPATCH_FLOATING_TYPES(
grad.scalar_type(), "ROIAlignRotated_backward", [&] {
RoIAlignRotatedBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad_.data_ptr<scalar_t>(),
num_rois,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_input.data_ptr<scalar_t>(),
rois_.data_ptr<scalar_t>());
});
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <torch/types.h>
namespace detectron2 {
at::Tensor box_iou_rotated_cpu(
const at::Tensor& boxes1,
const at::Tensor& boxes2);
#if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor box_iou_rotated_cuda(
const at::Tensor& boxes1,
const at::Tensor& boxes2);
#endif
// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
inline at::Tensor box_iou_rotated(
const at::Tensor& boxes1,
const at::Tensor& boxes2) {
assert(boxes1.device().is_cuda() == boxes2.device().is_cuda());
if (boxes1.device().is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous());
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous());
}
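// Illustrative usage sketch (not part of the original header): how the
// dispatcher above is typically called from C++. Boxes are rows of an N x 5
// float tensor in (x_ctr, y_ctr, w, h, angle_degrees) layout; the helper name
// below is hypothetical.
inline at::Tensor box_iou_rotated_example() {
  // two axis-aligned unit boxes whose centers are half a box apart;
  // intersection = 0.5, union = 1.5, so the expected IoU is 1/3
  float b1[5] = {0.5f, 0.5f, 1.0f, 1.0f, 0.0f};
  float b2[5] = {1.0f, 0.5f, 1.0f, 1.0f, 0.0f};
  at::Tensor boxes1 = at::from_blob(b1, {1, 5}, at::kFloat).clone();
  at::Tensor boxes2 = at::from_blob(b2, {1, 5}, at::kFloat).clone();
  return box_iou_rotated(boxes1, boxes2); // 1 x 1 tensor, value ~0.3333
}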
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include "box_iou_rotated.h"
#include "box_iou_rotated_utils.h"
namespace detectron2 {
template <typename T>
void box_iou_rotated_cpu_kernel(
const at::Tensor& boxes1,
const at::Tensor& boxes2,
at::Tensor& ious) {
auto num_boxes1 = boxes1.size(0);
auto num_boxes2 = boxes2.size(0);
for (int i = 0; i < num_boxes1; i++) {
for (int j = 0; j < num_boxes2; j++) {
ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>());
}
}
}
at::Tensor box_iou_rotated_cpu(
// input must be contiguous:
const at::Tensor& boxes1,
const at::Tensor& boxes2) {
auto num_boxes1 = boxes1.size(0);
auto num_boxes2 = boxes2.size(0);
at::Tensor ious =
at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious);
// reshape from 1d array to 2d array
auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
return ious.reshape(shape);
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include "box_iou_rotated_utils.h"
namespace detectron2 {
// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;
template <typename T>
__global__ void box_iou_rotated_cuda_kernel(
const int n_boxes1,
const int n_boxes2,
const T* dev_boxes1,
const T* dev_boxes2,
T* dev_ious) {
const int row_start = blockIdx.x * blockDim.x;
const int col_start = blockIdx.y * blockDim.y;
const int row_size = min(n_boxes1 - row_start, blockDim.x);
const int col_size = min(n_boxes2 - col_start, blockDim.y);
__shared__ float block_boxes1[BLOCK_DIM_X * 5];
__shared__ float block_boxes2[BLOCK_DIM_Y * 5];
// It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y
if (threadIdx.x < row_size && threadIdx.y == 0) {
block_boxes1[threadIdx.x * 5 + 0] =
dev_boxes1[(row_start + threadIdx.x) * 5 + 0];
block_boxes1[threadIdx.x * 5 + 1] =
dev_boxes1[(row_start + threadIdx.x) * 5 + 1];
block_boxes1[threadIdx.x * 5 + 2] =
dev_boxes1[(row_start + threadIdx.x) * 5 + 2];
block_boxes1[threadIdx.x * 5 + 3] =
dev_boxes1[(row_start + threadIdx.x) * 5 + 3];
block_boxes1[threadIdx.x * 5 + 4] =
dev_boxes1[(row_start + threadIdx.x) * 5 + 4];
}
if (threadIdx.x < col_size && threadIdx.y == 0) {
block_boxes2[threadIdx.x * 5 + 0] =
dev_boxes2[(col_start + threadIdx.x) * 5 + 0];
block_boxes2[threadIdx.x * 5 + 1] =
dev_boxes2[(col_start + threadIdx.x) * 5 + 1];
block_boxes2[threadIdx.x * 5 + 2] =
dev_boxes2[(col_start + threadIdx.x) * 5 + 2];
block_boxes2[threadIdx.x * 5 + 3] =
dev_boxes2[(col_start + threadIdx.x) * 5 + 3];
block_boxes2[threadIdx.x * 5 + 4] =
dev_boxes2[(col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
if (threadIdx.x < row_size && threadIdx.y < col_size) {
int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y;
dev_ious[offset] = single_box_iou_rotated<T>(
block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
}
}
at::Tensor box_iou_rotated_cuda(
// input must be contiguous
const at::Tensor& boxes1,
const at::Tensor& boxes2) {
using scalar_t = float;
AT_ASSERTM(
boxes1.scalar_type() == at::kFloat, "boxes1 must be a float tensor");
AT_ASSERTM(
boxes2.scalar_type() == at::kFloat, "boxes2 must be a float tensor");
AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor");
AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor");
at::cuda::CUDAGuard device_guard(boxes1.device());
auto num_boxes1 = boxes1.size(0);
auto num_boxes2 = boxes2.size(0);
at::Tensor ious =
at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
bool transpose = false;
if (num_boxes1 > 0 && num_boxes2 > 0) {
scalar_t *data1 = boxes1.data_ptr<scalar_t>(),
*data2 = boxes2.data_ptr<scalar_t>();
if (num_boxes2 > 65535 * BLOCK_DIM_Y) {
AT_ASSERTM(
num_boxes1 <= 65535 * BLOCK_DIM_Y,
"Too many boxes for box_iou_rotated_cuda!");
// x dim is allowed to be large, but y dim cannot,
// so we transpose the two to avoid "invalid configuration argument"
// error. We assume one of them is small. Otherwise the result is hard to
// fit in memory anyway.
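// e.g., with BLOCK_DIM_Y = 16 the grid's y dimension can cover at most
// 65535 * 16 = 1,048,560 boxes, so larger inputs must be placed on the x axis.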
std::swap(num_boxes1, num_boxes2);
std::swap(data1, data2);
transpose = true;
}
const int blocks_x =
at::cuda::ATenCeilDiv(static_cast<int>(num_boxes1), BLOCK_DIM_X);
const int blocks_y =
at::cuda::ATenCeilDiv(static_cast<int>(num_boxes2), BLOCK_DIM_Y);
dim3 blocks(blocks_x, blocks_y);
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
box_iou_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
num_boxes1,
num_boxes2,
data1,
data2,
(scalar_t*)ious.data_ptr<scalar_t>());
AT_CUDA_CHECK(cudaGetLastError());
}
// reshape from 1d array to 2d array
auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
if (transpose) {
return ious.view(shape).t();
} else {
return ious.view(shape);
}
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <cassert>
#include <cmath>
#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1
// Designates functions callable from the host (CPU) and the device (GPU)
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
#else
#include <algorithm>
#define HOST_DEVICE
#define HOST_DEVICE_INLINE HOST_DEVICE inline
#endif
namespace detectron2 {
namespace {
template <typename T>
struct RotatedBox {
T x_ctr, y_ctr, w, h, a;
};
template <typename T>
struct Point {
T x, y;
HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
HOST_DEVICE_INLINE Point operator+(const Point& p) const {
return Point(x + p.x, y + p.y);
}
HOST_DEVICE_INLINE Point& operator+=(const Point& p) {
x += p.x;
y += p.y;
return *this;
}
HOST_DEVICE_INLINE Point operator-(const Point& p) const {
return Point(x - p.x, y - p.y);
}
HOST_DEVICE_INLINE Point operator*(const T coeff) const {
return Point(x * coeff, y * coeff);
}
};
template <typename T>
HOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {
return A.x * B.x + A.y * B.y;
}
// R: result type. can be different from input type
template <typename T, typename R = T>
HOST_DEVICE_INLINE R cross_2d(const Point<T>& A, const Point<T>& B) {
return static_cast<R>(A.x) * static_cast<R>(B.y) -
static_cast<R>(B.x) * static_cast<R>(A.y);
}
template <typename T>
HOST_DEVICE_INLINE void get_rotated_vertices(
const RotatedBox<T>& box,
Point<T> (&pts)[4]) {
// M_PI / 180. == 0.01745329251
double theta = box.a * 0.01745329251;
T cosTheta2 = (T)cos(theta) * 0.5f;
T sinTheta2 = (T)sin(theta) * 0.5f;
// y: top --> down; x: left --> right
pts[0].x = box.x_ctr + sinTheta2 * box.h + cosTheta2 * box.w;
pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
pts[1].x = box.x_ctr - sinTheta2 * box.h + cosTheta2 * box.w;
pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
pts[2].x = 2 * box.x_ctr - pts[0].x;
pts[2].y = 2 * box.y_ctr - pts[0].y;
pts[3].x = 2 * box.x_ctr - pts[1].x;
pts[3].y = 2 * box.y_ctr - pts[1].y;
}
template <typename T>
HOST_DEVICE_INLINE int get_intersection_points(
const Point<T> (&pts1)[4],
const Point<T> (&pts2)[4],
Point<T> (&intersections)[24]) {
// Line vector
// A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
Point<T> vec1[4], vec2[4];
for (int i = 0; i < 4; i++) {
vec1[i] = pts1[(i + 1) % 4] - pts1[i];
vec2[i] = pts2[(i + 1) % 4] - pts2[i];
}
// When computing the intersection area, it doesn't hurt if we have
// more (duplicated/approximate) intersections/vertices than needed,
// while it can cause drastic difference if we miss an intersection/vertex.
// Therefore, we add an epsilon to relax the comparisons between
// the float point numbers that decide the intersection points.
double EPS = 1e-5;
// Line test - test all line combos for intersection
int num = 0; // number of intersections
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
// Solve for 2x2 Ax=b
T det = cross_2d<T>(vec2[j], vec1[i]);
// This takes care of parallel lines
if (fabs(det) <= 1e-14) {
continue;
}
auto vec12 = pts2[j] - pts1[i];
T t1 = cross_2d<T>(vec2[j], vec12) / det;
T t2 = cross_2d<T>(vec1[i], vec12) / det;
if (t1 > -EPS && t1 < 1.0f + EPS && t2 > -EPS && t2 < 1.0f + EPS) {
intersections[num++] = pts1[i] + vec1[i] * t1;
}
}
}
// Check for vertices of rect1 inside rect2
{
const auto& AB = vec2[0];
const auto& DA = vec2[3];
auto ABdotAB = dot_2d<T>(AB, AB);
auto ADdotAD = dot_2d<T>(DA, DA);
for (int i = 0; i < 4; i++) {
// assume ABCD is the rectangle, and P is the point to be judged
// P is inside ABCD iff. P's projection on AB lies within AB
// and P's projection on AD lies within AD
auto AP = pts1[i] - pts2[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) &&
(APdotAD < ADdotAD + EPS)) {
intersections[num++] = pts1[i];
}
}
}
// Reverse the check - check for vertices of rect2 inside rect1
{
const auto& AB = vec1[0];
const auto& DA = vec1[3];
auto ABdotAB = dot_2d<T>(AB, AB);
auto ADdotAD = dot_2d<T>(DA, DA);
for (int i = 0; i < 4; i++) {
auto AP = pts2[i] - pts1[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) &&
(APdotAD < ADdotAD + EPS)) {
intersections[num++] = pts2[i];
}
}
}
return num;
}
template <typename T>
HOST_DEVICE_INLINE int convex_hull_graham(
const Point<T> (&p)[24],
const int& num_in,
Point<T> (&q)[24],
bool shift_to_zero = false) {
assert(num_in >= 2);
// Step 1:
// Find point with minimum y
// if more than 1 points have the same minimum y,
// pick the one with the minimum x.
int t = 0;
for (int i = 1; i < num_in; i++) {
if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
t = i;
}
}
auto& start = p[t]; // starting point
// Step 2:
// Subtract the starting point from every point (for sorting in the next step)
for (int i = 0; i < num_in; i++) {
q[i] = p[i] - start;
}
// Swap the starting point to position 0
auto tmp = q[0];
q[0] = q[t];
q[t] = tmp;
// Step 3:
// Sort points 1 ~ (num_in - 1) according to their relative cross-product values
// (essentially sorting according to angles)
// If the angles are the same, sort according to their distance to origin
T dist[24];
#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1
// compute distance to origin before sort, and sort them together with the
// points
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
// CUDA version
// In the future, we can potentially use thrust
// for sorting here to improve speed (though not guaranteed)
for (int i = 1; i < num_in - 1; i++) {
for (int j = i + 1; j < num_in; j++) {
T crossProduct = cross_2d<T>(q[i], q[j]);
if ((crossProduct < -1e-6) ||
(fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
auto q_tmp = q[i];
q[i] = q[j];
q[j] = q_tmp;
auto dist_tmp = dist[i];
dist[i] = dist[j];
dist[j] = dist_tmp;
}
}
}
#else
// CPU version
// std::sort(
// q + 1, q + num_in, [](const Point<T>& A, const Point<T>& B) -> bool {
// T temp = cross_2d<T>(A, B);
// if (fabs(temp) < 1e-6) {
// return dot_2d<T>(A, A) < dot_2d<T>(B, B);
// } else {
// return temp > 0;
// }
// });
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
for (int i = 1; i < num_in - 1; i++) {
for (int j = i + 1; j < num_in; j++) {
T crossProduct = cross_2d<T>(q[i], q[j]);
if ((crossProduct < -1e-6) ||
(fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
auto q_tmp = q[i];
q[i] = q[j];
q[j] = q_tmp;
auto dist_tmp = dist[i];
dist[i] = dist[j];
dist[j] = dist_tmp;
}
}
}
// dist was permuted together with q by the sort above, so recomputing it here
// is redundant but harmless (kept from the original std::sort-based version).
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
#endif
// Step 4:
// Make sure there are at least 2 points (that don't overlap with each other)
// in the stack
int k; // index of the non-overlapped second point
for (k = 1; k < num_in; k++) {
if (dist[k] > 1e-8) {
break;
}
}
if (k == num_in) {
// We reach the end, which means the convex hull is just one point
q[0] = p[t];
return 1;
}
q[1] = q[k];
int m = 2; // 2 points in the stack
// Step 5:
// Finally we can start the scanning process.
// When a non-convex relationship between the 3 points is found
// (either concave shape or duplicated points),
// we pop the previous point from the stack
// until the 3-point relationship is convex again, or
// until the stack only contains two points
for (int i = k + 1; i < num_in; i++) {
while (m > 1) {
auto q1 = q[i] - q[m - 2], q2 = q[m - 1] - q[m - 2];
// cross_2d() uses FMA and therefore computes round(round(q1.x*q2.y) -
// q2.x*q1.y) So it may not return 0 even when q1==q2. Therefore we
// compare round(q1.x*q2.y) and round(q2.x*q1.y) directly. (round means
// round to nearest floating point).
if (q1.x * q2.y >= q2.x * q1.y)
m--;
else
break;
}
// Using double also helps, but float can solve the issue for now.
// while (m > 1 && cross_2d<T, double>(q[i] - q[m - 2], q[m - 1] - q[m - 2])
// >= 0) {
// m--;
// }
q[m++] = q[i];
}
// Step 6 (Optional):
// In general sense we need the original coordinates, so we
// need to shift the points back (reverting Step 2)
// But if we're only interested in getting the area/perimeter of the shape
// We can simply return.
if (!shift_to_zero) {
for (int i = 0; i < m; i++) {
q[i] += start;
}
}
return m;
}
template <typename T>
HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
if (m <= 2) {
return 0;
}
T area = 0;
for (int i = 1; i < m - 1; i++) {
area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
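// Worked example for polygon_area above: for the unit square
// q = (0,0), (1,0), (1,1), (0,1) with m = 4, the two triangle cross products
// are 1 and 1, so the returned area is (1 + 1) / 2 = 1.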
template <typename T>
HOST_DEVICE_INLINE T rotated_boxes_intersection(
const RotatedBox<T>& box1,
const RotatedBox<T>& box2) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
Point<T> intersectPts[24], orderedPts[24];
Point<T> pts1[4];
Point<T> pts2[4];
get_rotated_vertices<T>(box1, pts1);
get_rotated_vertices<T>(box2, pts2);
int num = get_intersection_points<T>(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
return polygon_area<T>(orderedPts, num_convex);
}
} // namespace
template <typename T>
HOST_DEVICE_INLINE T
single_box_iou_rotated(T const* const box1_raw, T const* const box2_raw) {
// shift center to the middle point to achieve higher precision in result
RotatedBox<T> box1, box2;
auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
box1.x_ctr = box1_raw[0] - center_shift_x;
box1.y_ctr = box1_raw[1] - center_shift_y;
box1.w = box1_raw[2];
box1.h = box1_raw[3];
box1.a = box1_raw[4];
box2.x_ctr = box2_raw[0] - center_shift_x;
box2.y_ctr = box2_raw[1] - center_shift_y;
box2.w = box2_raw[2];
box2.h = box2_raw[3];
box2.a = box2_raw[4];
T area1 = box1.w * box1.h;
T area2 = box2.w * box2.h;
if (area1 < 1e-14 || area2 < 1e-14) {
return 0.f;
}
T intersection = rotated_boxes_intersection<T>(box1, box2);
T iou = intersection / (area1 + area2 - intersection);
return iou;
}
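// Illustrative sketch (not part of the original header): single_box_iou_rotated
// on two concrete boxes in (x_ctr, y_ctr, w, h, angle_degrees) layout. Two
// identical 2 x 1 boxes, one rotated by 90 degrees, overlap in a 1 x 1 square,
// so the expected IoU is 1 / (2 + 2 - 1) = 1/3. The function name is ours.
HOST_DEVICE_INLINE float single_box_iou_rotated_example() {
  const float box1[5] = {0.f, 0.f, 2.f, 1.f, 0.f};
  const float box2[5] = {0.f, 0.f, 2.f, 1.f, 90.f};
  return single_box_iou_rotated<float>(box1, box2);
}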
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include "cocoeval.h"
#include <time.h>
#include <algorithm>
#include <cstdint>
#include <numeric>
using namespace pybind11::literals;
namespace detectron2 {
namespace COCOeval {
// Sort detections from highest score to lowest, such that
// detection_instances[detection_sorted_indices[t]] >=
// detection_instances[detection_sorted_indices[t+1]]. Use stable_sort to match
// original COCO API
void SortInstancesByDetectionScore(
const std::vector<InstanceAnnotation>& detection_instances,
std::vector<uint64_t>* detection_sorted_indices) {
detection_sorted_indices->resize(detection_instances.size());
std::iota(
detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
std::stable_sort(
detection_sorted_indices->begin(),
detection_sorted_indices->end(),
[&detection_instances](size_t j1, size_t j2) {
return detection_instances[j1].score > detection_instances[j2].score;
});
}
// Partition the ground truth objects based on whether or not to ignore them
// based on area
void SortInstancesByIgnore(
const std::array<double, 2>& area_range,
const std::vector<InstanceAnnotation>& ground_truth_instances,
std::vector<uint64_t>* ground_truth_sorted_indices,
std::vector<bool>* ignores) {
ignores->clear();
ignores->reserve(ground_truth_instances.size());
for (const auto& o : ground_truth_instances) {
ignores->push_back(
o.ignore || o.area < area_range[0] || o.area > area_range[1]);
}
ground_truth_sorted_indices->resize(ground_truth_instances.size());
std::iota(
ground_truth_sorted_indices->begin(),
ground_truth_sorted_indices->end(),
0);
std::stable_sort(
ground_truth_sorted_indices->begin(),
ground_truth_sorted_indices->end(),
[&ignores](size_t j1, size_t j2) {
return (int)(*ignores)[j1] < (int)(*ignores)[j2];
});
}
// For each IOU threshold, greedily match each detected instance to a ground
// truth instance (if possible) and store the results
void MatchDetectionsToGroundTruth(
const std::vector<InstanceAnnotation>& detection_instances,
const std::vector<uint64_t>& detection_sorted_indices,
const std::vector<InstanceAnnotation>& ground_truth_instances,
const std::vector<uint64_t>& ground_truth_sorted_indices,
const std::vector<bool>& ignores,
const std::vector<std::vector<double>>& ious,
const std::vector<double>& iou_thresholds,
const std::array<double, 2>& area_range,
ImageEvaluation* results) {
// Initialize memory to store return data matches and ignore
const int num_iou_thresholds = iou_thresholds.size();
const int num_ground_truth = ground_truth_sorted_indices.size();
const int num_detections = detection_sorted_indices.size();
std::vector<uint64_t> ground_truth_matches(
num_iou_thresholds * num_ground_truth, 0);
std::vector<uint64_t>& detection_matches = results->detection_matches;
std::vector<bool>& detection_ignores = results->detection_ignores;
std::vector<bool>& ground_truth_ignores = results->ground_truth_ignores;
detection_matches.resize(num_iou_thresholds * num_detections, 0);
detection_ignores.resize(num_iou_thresholds * num_detections, false);
ground_truth_ignores.resize(num_ground_truth);
for (auto g = 0; g < num_ground_truth; ++g) {
ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]];
}
for (auto t = 0; t < num_iou_thresholds; ++t) {
for (auto d = 0; d < num_detections; ++d) {
// information about best match so far (match=-1 -> unmatched)
double best_iou = std::min(iou_thresholds[t], 1 - 1e-10);
int match = -1;
for (auto g = 0; g < num_ground_truth; ++g) {
// if this ground truth instance is already matched and not a
// crowd, it cannot be matched to another detection
if (ground_truth_matches[t * num_ground_truth + g] > 0 &&
!ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) {
continue;
}
// if detected instance matched to a regular ground truth
// instance, we can break on the first ground truth instance
// tagged as ignore (because they are sorted by the ignore tag)
if (match >= 0 && !ground_truth_ignores[match] &&
ground_truth_ignores[g]) {
break;
}
// if IOU overlap is the best so far, store the match appropriately
if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) {
best_iou = ious[d][ground_truth_sorted_indices[g]];
match = g;
}
}
// if match was made, store id of match for both detection and
// ground truth
if (match >= 0) {
detection_ignores[t * num_detections + d] = ground_truth_ignores[match];
detection_matches[t * num_detections + d] =
ground_truth_instances[ground_truth_sorted_indices[match]].id;
ground_truth_matches[t * num_ground_truth + match] =
detection_instances[detection_sorted_indices[d]].id;
}
// set unmatched detections outside of area range to ignore
const InstanceAnnotation& detection =
detection_instances[detection_sorted_indices[d]];
detection_ignores[t * num_detections + d] =
detection_ignores[t * num_detections + d] ||
(detection_matches[t * num_detections + d] == 0 &&
(detection.area < area_range[0] || detection.area > area_range[1]));
}
}
// store detection score results
results->detection_scores.resize(detection_sorted_indices.size());
for (size_t d = 0; d < detection_sorted_indices.size(); ++d) {
results->detection_scores[d] =
detection_instances[detection_sorted_indices[d]].score;
}
}
std::vector<ImageEvaluation> EvaluateImages(
const std::vector<std::array<double, 2>>& area_ranges,
int max_detections,
const std::vector<double>& iou_thresholds,
const ImageCategoryInstances<std::vector<double>>& image_category_ious,
const ImageCategoryInstances<InstanceAnnotation>&
image_category_ground_truth_instances,
const ImageCategoryInstances<InstanceAnnotation>&
image_category_detection_instances) {
const int num_area_ranges = area_ranges.size();
const int num_images = image_category_ground_truth_instances.size();
const int num_categories =
image_category_ious.size() > 0 ? image_category_ious[0].size() : 0;
std::vector<uint64_t> detection_sorted_indices;
std::vector<uint64_t> ground_truth_sorted_indices;
std::vector<bool> ignores;
std::vector<ImageEvaluation> results_all(
num_images * num_area_ranges * num_categories);
// Store results for each image, category, and area range combination. Results
// for each IOU threshold are packed into the same ImageEvaluation object
for (auto i = 0; i < num_images; ++i) {
for (auto c = 0; c < num_categories; ++c) {
const std::vector<InstanceAnnotation>& ground_truth_instances =
image_category_ground_truth_instances[i][c];
const std::vector<InstanceAnnotation>& detection_instances =
image_category_detection_instances[i][c];
SortInstancesByDetectionScore(
detection_instances, &detection_sorted_indices);
if ((int)detection_sorted_indices.size() > max_detections) {
detection_sorted_indices.resize(max_detections);
}
for (size_t a = 0; a < area_ranges.size(); ++a) {
SortInstancesByIgnore(
area_ranges[a],
ground_truth_instances,
&ground_truth_sorted_indices,
&ignores);
MatchDetectionsToGroundTruth(
detection_instances,
detection_sorted_indices,
ground_truth_instances,
ground_truth_sorted_indices,
ignores,
image_category_ious[i][c],
iou_thresholds,
area_ranges[a],
&results_all
[c * num_area_ranges * num_images + a * num_images + i]);
}
}
}
return results_all;
}
// Convert a python list to a vector
template <typename T>
std::vector<T> list_to_vec(const py::list& l) {
std::vector<T> v(py::len(l));
for (int i = 0; i < (int)py::len(l); ++i) {
v[i] = l[i].cast<T>();
}
return v;
}
// Helper function to Accumulate()
// Considers the evaluation results applicable to a particular category, area
// range, and max_detections parameter setting, which begin at
// evaluations[evaluation_index]. Extracts a sorted list of length n of all
// applicable detection instances concatenated across all images in the dataset,
// which are represented by the outputs evaluation_indices, detection_scores,
// image_detection_indices, and detection_sorted_indices--all of which are
// length n. evaluation_indices[i] stores the applicable index into
// evaluations[] for instance i, which has detection score detection_score[i],
// and is the image_detection_indices[i]'th of the list of detections
// for the image containing i. detection_sorted_indices[] defines a sorted
// permutation of the 3 other outputs
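// For example, with num_images = 2, max_detections = 3, and 2 and 3 detections
// in the two images respectively, n = 5: evaluation_indices is
// {e, e, e+1, e+1, e+1} (with e = evaluation_index), image_detection_indices is
// {0, 1, 0, 1, 2}, and detection_sorted_indices orders these 5 entries by
// decreasing detection score.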
int BuildSortedDetectionList(
const std::vector<ImageEvaluation>& evaluations,
const int64_t evaluation_index,
const int64_t num_images,
const int max_detections,
std::vector<uint64_t>* evaluation_indices,
std::vector<double>* detection_scores,
std::vector<uint64_t>* detection_sorted_indices,
std::vector<uint64_t>* image_detection_indices) {
assert(evaluations.size() >= evaluation_index + num_images);
// Extract a list of object instances of the applicable category, area
// range, and max detections requirements such that they can be sorted
image_detection_indices->clear();
evaluation_indices->clear();
detection_scores->clear();
image_detection_indices->reserve(num_images * max_detections);
evaluation_indices->reserve(num_images * max_detections);
detection_scores->reserve(num_images * max_detections);
int num_valid_ground_truth = 0;
for (auto i = 0; i < num_images; ++i) {
const ImageEvaluation& evaluation = evaluations[evaluation_index + i];
for (int d = 0;
d < (int)evaluation.detection_scores.size() && d < max_detections;
++d) { // detected instances
evaluation_indices->push_back(evaluation_index + i);
image_detection_indices->push_back(d);
detection_scores->push_back(evaluation.detection_scores[d]);
}
for (auto ground_truth_ignore : evaluation.ground_truth_ignores) {
if (!ground_truth_ignore) {
++num_valid_ground_truth;
}
}
}
// Sort detections by decreasing score, using stable sort to match
// python implementation
detection_sorted_indices->resize(detection_scores->size());
std::iota(
detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
std::stable_sort(
detection_sorted_indices->begin(),
detection_sorted_indices->end(),
[&detection_scores](size_t j1, size_t j2) {
return (*detection_scores)[j1] > (*detection_scores)[j2];
});
return num_valid_ground_truth;
}
// Helper function to Accumulate()
// Compute a precision recall curve given a sorted list of detected instances
// encoded in evaluations, evaluation_indices, detection_scores,
// detection_sorted_indices, image_detection_indices (see
// BuildSortedDetectionList()). Using vectors precisions and recalls
// and temporary storage, output the results into precisions_out, recalls_out,
// and scores_out, which are large buffers containing many precision/recall curves
// for all possible parameter settings, with precisions_out_index and
// recalls_out_index defining the applicable indices to store results.
void ComputePrecisionRecallCurve(
const int64_t precisions_out_index,
const int64_t precisions_out_stride,
const int64_t recalls_out_index,
const std::vector<double>& recall_thresholds,
const int iou_threshold_index,
const int num_iou_thresholds,
const int num_valid_ground_truth,
const std::vector<ImageEvaluation>& evaluations,
const std::vector<uint64_t>& evaluation_indices,
const std::vector<double>& detection_scores,
const std::vector<uint64_t>& detection_sorted_indices,
const std::vector<uint64_t>& image_detection_indices,
std::vector<double>* precisions,
std::vector<double>* recalls,
std::vector<double>* precisions_out,
std::vector<double>* scores_out,
std::vector<double>* recalls_out) {
assert(recalls_out->size() > recalls_out_index);
// Compute precision/recall for each instance in the sorted list of detections
int64_t true_positives_sum = 0, false_positives_sum = 0;
precisions->clear();
recalls->clear();
precisions->reserve(detection_sorted_indices.size());
recalls->reserve(detection_sorted_indices.size());
assert(!evaluations.empty() || detection_sorted_indices.empty());
for (auto detection_sorted_index : detection_sorted_indices) {
const ImageEvaluation& evaluation =
evaluations[evaluation_indices[detection_sorted_index]];
const auto num_detections =
evaluation.detection_matches.size() / num_iou_thresholds;
const auto detection_index = iou_threshold_index * num_detections +
image_detection_indices[detection_sorted_index];
assert(evaluation.detection_matches.size() > detection_index);
assert(evaluation.detection_ignores.size() > detection_index);
const int64_t detection_match =
evaluation.detection_matches[detection_index];
const bool detection_ignores =
evaluation.detection_ignores[detection_index];
const auto true_positive = detection_match > 0 && !detection_ignores;
const auto false_positive = detection_match == 0 && !detection_ignores;
if (true_positive) {
++true_positives_sum;
}
if (false_positive) {
++false_positives_sum;
}
const double recall =
static_cast<double>(true_positives_sum) / num_valid_ground_truth;
recalls->push_back(recall);
const int64_t num_valid_detections =
true_positives_sum + false_positives_sum;
const double precision = num_valid_detections > 0
? static_cast<double>(true_positives_sum) / num_valid_detections
: 0.0;
precisions->push_back(precision);
}
(*recalls_out)[recalls_out_index] = !recalls->empty() ? recalls->back() : 0;
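// Make the precision curve monotonically non-increasing by scanning backwards
// and propagating the running maximum (the standard COCO interpolation applied
// before sampling at the recall thresholds below).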
for (int64_t i = static_cast<int64_t>(precisions->size()) - 1; i > 0; --i) {
if ((*precisions)[i] > (*precisions)[i - 1]) {
(*precisions)[i - 1] = (*precisions)[i];
}
}
// Sample the per instance precision/recall list at each recall threshold
for (size_t r = 0; r < recall_thresholds.size(); ++r) {
// first index in recalls >= recall_thresholds[r]
std::vector<double>::iterator low = std::lower_bound(
recalls->begin(), recalls->end(), recall_thresholds[r]);
size_t precisions_index = low - recalls->begin();
const auto results_ind = precisions_out_index + r * precisions_out_stride;
assert(results_ind < precisions_out->size());
assert(results_ind < scores_out->size());
if (precisions_index < precisions->size()) {
(*precisions_out)[results_ind] = (*precisions)[precisions_index];
(*scores_out)[results_ind] =
detection_scores[detection_sorted_indices[precisions_index]];
} else {
(*precisions_out)[results_ind] = 0;
(*scores_out)[results_ind] = 0;
}
}
}
py::dict Accumulate(
const py::object& params,
const std::vector<ImageEvaluation>& evaluations) {
const std::vector<double> recall_thresholds =
list_to_vec<double>(params.attr("recThrs"));
const std::vector<int> max_detections =
list_to_vec<int>(params.attr("maxDets"));
const int num_iou_thresholds = py::len(params.attr("iouThrs"));
const int num_recall_thresholds = py::len(params.attr("recThrs"));
const int num_categories = params.attr("useCats").cast<int>() == 1
? py::len(params.attr("catIds"))
: 1;
const int num_area_ranges = py::len(params.attr("areaRng"));
const int num_max_detections = py::len(params.attr("maxDets"));
const int num_images = py::len(params.attr("imgIds"));
std::vector<double> precisions_out(
num_iou_thresholds * num_recall_thresholds * num_categories *
num_area_ranges * num_max_detections,
-1);
std::vector<double> recalls_out(
num_iou_thresholds * num_categories * num_area_ranges *
num_max_detections,
-1);
std::vector<double> scores_out(
num_iou_thresholds * num_recall_thresholds * num_categories *
num_area_ranges * num_max_detections,
-1);
// Consider the list of all detected instances in the entire dataset in one
// large list. evaluation_indices, detection_scores,
// image_detection_indices, and detection_sorted_indices all have the same
// length as this list, such that each entry corresponds to one detected
// instance
std::vector<uint64_t> evaluation_indices; // indices into evaluations[]
std::vector<double> detection_scores; // detection scores of each instance
std::vector<uint64_t> detection_sorted_indices; // sorted indices of all
// instances in the dataset
std::vector<uint64_t>
image_detection_indices; // indices into the list of detected instances in
// the same image as each instance
std::vector<double> precisions, recalls;
for (auto c = 0; c < num_categories; ++c) {
for (auto a = 0; a < num_area_ranges; ++a) {
for (auto m = 0; m < num_max_detections; ++m) {
// The COCO PythonAPI assumes evaluations[] (the return value of
// COCOeval::EvaluateImages()) is one long list storing results for each
// combination of category, area range, and image id, with categories in
// the outermost loop and images in the innermost loop.
const int64_t evaluations_index =
c * num_area_ranges * num_images + a * num_images;
int num_valid_ground_truth = BuildSortedDetectionList(
evaluations,
evaluations_index,
num_images,
max_detections[m],
&evaluation_indices,
&detection_scores,
&detection_sorted_indices,
&image_detection_indices);
if (num_valid_ground_truth == 0) {
continue;
}
for (auto t = 0; t < num_iou_thresholds; ++t) {
// recalls_out is a flattened vector representing a
// num_iou_thresholds X num_categories X num_area_ranges X
// num_max_detections matrix
const int64_t recalls_out_index =
t * num_categories * num_area_ranges * num_max_detections +
c * num_area_ranges * num_max_detections +
a * num_max_detections + m;
// precisions_out and scores_out are flattened vectors
// representing a num_iou_thresholds X num_recall_thresholds X
// num_categories X num_area_ranges X num_max_detections matrix
const int64_t precisions_out_stride =
num_categories * num_area_ranges * num_max_detections;
const int64_t precisions_out_index = t * num_recall_thresholds *
num_categories * num_area_ranges * num_max_detections +
c * num_area_ranges * num_max_detections +
a * num_max_detections + m;
ComputePrecisionRecallCurve(
precisions_out_index,
precisions_out_stride,
recalls_out_index,
recall_thresholds,
t,
num_iou_thresholds,
num_valid_ground_truth,
evaluations,
evaluation_indices,
detection_scores,
detection_sorted_indices,
image_detection_indices,
&precisions,
&recalls,
&precisions_out,
&scores_out,
&recalls_out);
}
}
}
}
time_t rawtime;
struct tm local_time;
std::array<char, 200> buffer;
time(&rawtime);
#ifdef _WIN32
localtime_s(&local_time, &rawtime);
#else
localtime_r(&rawtime, &local_time);
#endif
strftime(buffer.data(), 200, "%Y-%m-%d %H:%M:%S", &local_time);
return py::dict(
"params"_a = params,
"counts"_a = std::vector<int64_t>(
{num_iou_thresholds,
num_recall_thresholds,
num_categories,
num_area_ranges,
num_max_detections}),
"date"_a = buffer,
"precision"_a = precisions_out,
"recall"_a = recalls_out,
"scores"_a = scores_out);
}
} // namespace COCOeval
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>
#include <vector>
namespace py = pybind11;
namespace detectron2 {
namespace COCOeval {
// Annotation data for a single object instance in an image
struct InstanceAnnotation {
InstanceAnnotation(
uint64_t id,
double score,
double area,
bool is_crowd,
bool ignore)
: id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {}
uint64_t id;
double score = 0.;
double area = 0.;
bool is_crowd = false;
bool ignore = false;
};
// Stores intermediate results for evaluating detection results for a single
// image that has D detected instances and G ground truth instances. This stores
// matches between detected and ground truth instances
struct ImageEvaluation {
// For each of the D detected instances, the id of the matched ground truth
// instance, or 0 if unmatched
std::vector<uint64_t> detection_matches;
// The detection score of each of the D detected instances
std::vector<double> detection_scores;
// Marks whether or not each of G instances was ignored from evaluation (e.g.,
// because it's outside area_range)
std::vector<bool> ground_truth_ignores;
// Marks whether or not each of D instances was ignored from evaluation (e.g.,
// because it's outside area_range)
std::vector<bool> detection_ignores;
};
template <class T>
using ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;
// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each
// combination of image, category, area range settings, and IOU thresholds to
// evaluate, it matches detected instances to ground truth instances and stores
// the results into a vector of ImageEvaluation results, which will be
// interpreted by the COCOeval::Accumulate() function to produce precision-recall
// curves. The parameters of nested vectors have the following semantics:
// image_category_ious[i][c][d][g] is the intersection over union of the d'th
// detected instance and g'th ground truth instance of
// category category_ids[c] in image image_ids[i]
// image_category_ground_truth_instances[i][c] is a vector of ground truth
// instances in image image_ids[i] of category category_ids[c]
// image_category_detection_instances[i][c] is a vector of detected
// instances in image image_ids[i] of category category_ids[c]
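// For example, image_category_ious[0][2][d][g] is the IOU between the d'th
// detected instance and the g'th ground truth instance of category
// category_ids[2] in image image_ids[0].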
std::vector<ImageEvaluation> EvaluateImages(
const std::vector<std::array<double, 2>>& area_ranges, // vector of 2-tuples
int max_detections,
const std::vector<double>& iou_thresholds,
const ImageCategoryInstances<std::vector<double>>& image_category_ious,
const ImageCategoryInstances<InstanceAnnotation>&
image_category_ground_truth_instances,
const ImageCategoryInstances<InstanceAnnotation>&
image_category_detection_instances);
// C++ implementation of COCOeval.accumulate(), which generates precision
// recall curves for each set of category, IOU threshold, detection area range,
// and max number of detections parameters. It is assumed that the parameter
// evaluations is the return value of the function COCOeval::EvaluateImages(),
// which was called with the same parameter settings params
py::dict Accumulate(
const py::object& params,
const std::vector<ImageEvaluation>& evaluations);
} // namespace COCOeval
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include <cuda_runtime_api.h>
namespace detectron2 {
int get_cudart_version() {
// Not a ROCM platform: Either HIP is not used, or
// it is used, but platform is not ROCM (i.e. it is CUDA)
#if !defined(__HIP_PLATFORM_AMD__)
return CUDART_VERSION;
#else
int version = 0;
#if HIP_VERSION_MAJOR != 0
// Create a convention similar to that of CUDA, as assumed by other
// parts of the code.
version = HIP_VERSION_MINOR;
version += (HIP_VERSION_MAJOR * 100);
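// e.g., HIP 3.7 is reported as 3 * 100 + 7 = 307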
#else
hipRuntimeGetVersion(&version);
#endif
return version;
#endif
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <torch/types.h>
namespace detectron2 {
#if defined(WITH_CUDA) || defined(WITH_HIP)
int deform_conv_forward_cuda(
at::Tensor input,
at::Tensor weight,
at::Tensor offset,
at::Tensor output,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step);
int deform_conv_backward_input_cuda(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradInput,
at::Tensor gradOffset,
at::Tensor weight,
at::Tensor columns,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step);
int deform_conv_backward_parameters_cuda(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradWeight, // at::Tensor gradBias,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
float scale,
int im2col_step);
void modulated_deform_conv_cuda_forward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor output,
at::Tensor columns,
int kernel_h,
int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const int group,
const int deformable_group,
const bool with_bias);
void modulated_deform_conv_cuda_backward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor columns,
at::Tensor grad_input,
at::Tensor grad_weight,
at::Tensor grad_bias,
at::Tensor grad_offset,
at::Tensor grad_mask,
at::Tensor grad_output,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int pad_h,
int pad_w,
int dilation_h,
int dilation_w,
int group,
int deformable_group,
const bool with_bias);
#endif
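// Shape reference for the dispatchers below (a summary, assuming the standard
// deformable-convolution layout expected by the CUDA kernels):
//   input : (N, C_in, H, W)
//   weight: (C_out, C_in / group, kH, kW)
//   offset: (N, deformable_group * 2 * kH * kW, H_out, W_out)
//   mask  : (N, deformable_group * kH * kW, H_out, W_out)  -- modulated variant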
inline int deform_conv_forward(
at::Tensor input,
at::Tensor weight,
at::Tensor offset,
at::Tensor output,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step) {
if (input.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
return deform_conv_forward_cuda(
input,
weight,
offset,
output,
columns,
ones,
kW,
kH,
dW,
dH,
padW,
padH,
dilationW,
dilationH,
group,
deformable_group,
im2col_step);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
AT_ERROR("This operator is not implemented on CPU");
}
inline int deform_conv_backward_input(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradInput,
at::Tensor gradOffset,
at::Tensor weight,
at::Tensor columns,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step) {
if (gradOutput.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
return deform_conv_backward_input_cuda(
input,
offset,
gradOutput,
gradInput,
gradOffset,
weight,
columns,
kW,
kH,
dW,
dH,
padW,
padH,
dilationW,
dilationH,
group,
deformable_group,
im2col_step);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
AT_ERROR("This operator is not implemented on CPU");
}
inline int deform_conv_backward_filter(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradWeight, // at::Tensor gradBias,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
float scale,
int im2col_step) {
if (gradOutput.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
return deform_conv_backward_parameters_cuda(
input,
offset,
gradOutput,
gradWeight,
columns,
ones,
kW,
kH,
dW,
dH,
padW,
padH,
dilationW,
dilationH,
group,
deformable_group,
scale,
im2col_step);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
AT_ERROR("This operator is not implemented on CPU");
}
inline void modulated_deform_conv_forward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor output,
at::Tensor columns,
int kernel_h,
int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const int group,
const int deformable_group,
const bool with_bias) {
if (input.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!");
TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
return modulated_deform_conv_cuda_forward(
input,
weight,
bias,
ones,
offset,
mask,
output,
columns,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
group,
deformable_group,
with_bias);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
AT_ERROR("This operator is not implemented on CPU");
}
inline void modulated_deform_conv_backward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor columns,
at::Tensor grad_input,
at::Tensor grad_weight,
at::Tensor grad_bias,
at::Tensor grad_offset,
at::Tensor grad_mask,
at::Tensor grad_output,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int pad_h,
int pad_w,
int dilation_h,
int dilation_w,
int group,
int deformable_group,
const bool with_bias) {
if (grad_output.is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!");
TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
return modulated_deform_conv_cuda_backward(
input,
weight,
bias,
ones,
offset,
mask,
columns,
grad_input,
grad_weight,
grad_bias,
grad_offset,
grad_mask,
grad_output,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
group,
deformable_group,
with_bias);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
AT_ERROR("This operator is not implemented on CPU");
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
// modified from
// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp
// Original license: Apache 2.0
// modify from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
// Original license: Apache 2.0
#include <torch/types.h>
#include "deform_conv.h"
#include <cmath>
#include <vector>
namespace detectron2 {
void deformable_im2col(
const at::Tensor data_im,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor data_col);
void deformable_col2im(
const at::Tensor data_col,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor grad_im);
void deformable_col2im_coord(
const at::Tensor data_col,
const at::Tensor data_im,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor grad_offset);
void modulated_deformable_im2col_cuda(
const at::Tensor data_im,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor data_col);
void modulated_deformable_col2im_cuda(
const at::Tensor data_col,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor grad_im);
void modulated_deformable_col2im_coord_cuda(
const at::Tensor data_col,
const at::Tensor data_im,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor grad_offset,
at::Tensor grad_mask);
void shape_check(
at::Tensor input,
at::Tensor offset,
at::Tensor* gradOutput,
at::Tensor weight,
int kH,
int kW,
int dH,
int dW,
int padH,
int padW,
int dilationH,
int dilationW,
int group,
int deformable_group) {
TORCH_CHECK(
weight.ndimension() == 4,
"4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
"but got: %s",
weight.ndimension());
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(
kW > 0 && kH > 0,
"kernel size should be greater than zero, but got kH: %d kW: %d",
kH,
kW);
TORCH_CHECK(
(weight.size(2) == kH && weight.size(3) == kW),
"kernel size should be consistent with weight, ",
"but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d",
kH,
kW,
weight.size(2),
weight.size(3));
TORCH_CHECK(
dW > 0 && dH > 0,
"stride should be greater than zero, but got dH: %d dW: %d",
dH,
dW);
TORCH_CHECK(
dilationW > 0 && dilationH > 0,
"dilation should be greater than 0, but got dilationH: %d dilationW: %d",
dilationH,
dilationW);
int ndim = input.ndimension();
int dimf = 0;
int dimh = 1;
int dimw = 2;
if (ndim == 4) {
dimf++;
dimh++;
dimw++;
}
TORCH_CHECK(
ndim == 3 || ndim == 4,
"3D or 4D input tensor expected but got: %s",
ndim);
long nInputPlane = weight.size(1) * group;
long inputHeight = input.size(dimh);
long inputWidth = input.size(dimw);
long nOutputPlane = weight.size(0);
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
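// e.g., a 224 x 224 input with kH = kW = 3, padH = padW = 1,
// dilationH = dilationW = 1 and stride dH = dW = 1 gives
// (224 + 2 * 1 - (1 * (3 - 1) + 1)) / 1 + 1 = 224, i.e. a 224 x 224 output.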
TORCH_CHECK(
nInputPlane % deformable_group == 0,
"input channels must divide deformable group size");
if (outputWidth < 1 || outputHeight < 1)
AT_ERROR(
"Given input size: (%ld x %ld x %ld). "
"Calculated output size: (%ld x %ld x %ld). Output size is too small",
nInputPlane,
inputHeight,
inputWidth,
nOutputPlane,
outputHeight,
outputWidth);
TORCH_CHECK(
input.size(1) == nInputPlane,
"invalid number of input planes, expected: %d, but got: %d",
nInputPlane,
input.size(1));
TORCH_CHECK(
(inputHeight + 2 * padH >= kH && inputWidth + 2 * padW >= kW),
"input image is smaller than kernel");
TORCH_CHECK(
(offset.size(2) == outputHeight && offset.size(3) == outputWidth),
"invalid spatial size of offset, expected height: %d width: %d, but "
"got height: %d width: %d",
outputHeight,
outputWidth,
offset.size(2),
offset.size(3));
TORCH_CHECK(
(offset.size(1) == deformable_group * 2 * kH * kW),
"invalid number of channels of offset");
if (gradOutput != NULL) {
TORCH_CHECK(
gradOutput->size(dimf) == nOutputPlane,
"invalid number of gradOutput planes, expected: %d, but got: %d",
nOutputPlane,
gradOutput->size(dimf));
TORCH_CHECK(
(gradOutput->size(dimh) == outputHeight &&
gradOutput->size(dimw) == outputWidth),
"invalid size of gradOutput, expected height: %d width: %d , but "
"got height: %d width: %d",
outputHeight,
outputWidth,
gradOutput->size(dimh),
gradOutput->size(dimw));
}
}
int deform_conv_forward_cuda(
at::Tensor input,
at::Tensor weight,
at::Tensor offset,
at::Tensor output,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step) {
// todo: resize columns to include im2col: done
// todo: add im2col_step as input
// todo: add new output buffer and transpose it to output (or directly
// transpose output)
// todo: possibly change data indexing because of parallel_imgs
shape_check(
input,
offset,
NULL,
weight,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW,
group,
deformable_group);
input = input.contiguous();
offset = offset.contiguous();
weight = weight.contiguous();
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input.unsqueeze_(0);
offset.unsqueeze_(0);
}
// todo: assert batchsize dividable by im2col_step
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = weight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
output = output.view(
{batchSize / im2col_step,
im2col_step,
nOutputPlane,
outputHeight,
outputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
ones = at::ones({outputHeight, outputWidth}, input.options());
}
input = input.view(
{batchSize / im2col_step,
im2col_step,
nInputPlane,
inputHeight,
inputWidth});
offset = offset.view(
{batchSize / im2col_step,
im2col_step,
deformable_group * 2 * kH * kW,
outputHeight,
outputWidth});
at::Tensor output_buffer = at::zeros(
{batchSize / im2col_step,
nOutputPlane,
im2col_step * outputHeight,
outputWidth},
output.options());
output_buffer = output_buffer.view(
{output_buffer.size(0),
group,
output_buffer.size(1) / group,
output_buffer.size(2),
output_buffer.size(3)});
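// Each iteration below processes im2col_step images: deformable_im2col unfolds
// them into columns of shape (nInputPlane * kH * kW, im2col_step * outH * outW),
// and for each group g the product of weight[g].flatten(1), of shape
// (nOutputPlane / group, nInputPlane / group * kH * kW), with columns[g] is
// accumulated into output_buffer[elt][g].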
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(
input[elt],
offset[elt],
nInputPlane,
inputHeight,
inputWidth,
kH,
kW,
padH,
padW,
dH,
dW,
dilationH,
dilationW,
im2col_step,
deformable_group,
columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view(
{group,
weight.size(0) / group,
weight.size(1),
weight.size(2),
weight.size(3)});
for (int g = 0; g < group; g++) {
output_buffer[elt][g] = output_buffer[elt][g]
.flatten(1)
.addmm_(weight[g].flatten(1), columns[g])
.view_as(output_buffer[elt][g]);
}
}
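// Merge the group dimension back, split im2col_step out of the height axis,
// and swap it in front of the channel axis so the copy below restores
// (N, C, H, W) order.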
output_buffer = output_buffer.view(
{output_buffer.size(0),
output_buffer.size(1) * output_buffer.size(2),
output_buffer.size(3),
output_buffer.size(4)});
output_buffer = output_buffer.view(
{batchSize / im2col_step,
nOutputPlane,
im2col_step,
outputHeight,
outputWidth});
output_buffer.transpose_(1, 2);
output.copy_(output_buffer);
output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
output = output.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
}
return 1;
}
int deform_conv_backward_input_cuda(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradInput,
at::Tensor gradOffset,
at::Tensor weight,
at::Tensor columns,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
int im2col_step) {
shape_check(
input,
offset,
&gradOutput,
weight,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW,
group,
deformable_group);
input = input.contiguous();
offset = offset.contiguous();
gradOutput = gradOutput.contiguous();
weight = weight.contiguous();
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input = input.view({1, input.size(0), input.size(1), input.size(2)});
offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
gradOutput = gradOutput.view(
{1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
}
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = weight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
// change order of grad output
gradOutput = gradOutput.view(
{batchSize / im2col_step,
im2col_step,
nOutputPlane,
outputHeight,
outputWidth});
gradOutput.transpose_(1, 2);
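// gradOutput is now [chunks, nOutputPlane, im2col_step, outH, outW], so
// flattening the last three dims lines up with the `columns` layout below.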
gradInput = gradInput.view(
{batchSize / im2col_step,
im2col_step,
nInputPlane,
inputHeight,
inputWidth});
input = input.view(
{batchSize / im2col_step,
im2col_step,
nInputPlane,
inputHeight,
inputWidth});
gradOffset = gradOffset.view(
{batchSize / im2col_step,
im2col_step,
deformable_group * 2 * kH * kW,
outputHeight,
outputWidth});
offset = offset.view(
{batchSize / im2col_step,
im2col_step,
deformable_group * 2 * kH * kW,
outputHeight,
outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
// divide into groups
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view(
{group,
weight.size(0) / group,
weight.size(1),
weight.size(2),
weight.size(3)});
gradOutput = gradOutput.view(
{gradOutput.size(0),
group,
gradOutput.size(1) / group,
gradOutput.size(2),
gradOutput.size(3),
gradOutput.size(4)});
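// columns[g] = weight[g]^T x gradOutput[elt][g]: gradient of the loss w.r.t.
// the im2col columns (beta = 0 discards the previous contents of `columns`).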
for (int g = 0; g < group; g++) {
columns[g] = columns[g].addmm_(
weight[g].flatten(1).transpose(0, 1),
gradOutput[elt][g].flatten(1),
0.0f,
1.0f);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradOutput = gradOutput.view(
{gradOutput.size(0),
gradOutput.size(1) * gradOutput.size(2),
gradOutput.size(3),
gradOutput.size(4),
gradOutput.size(5)});
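// Two scatter passes: first accumulate the gradient w.r.t. the sampling
// offsets, then scatter the column gradient back onto the input image.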
deformable_col2im_coord(
columns,
input[elt],
offset[elt],
nInputPlane,
inputHeight,
inputWidth,
kH,
kW,
padH,
padW,
dH,
dW,
dilationH,
dilationW,
im2col_step,
deformable_group,
gradOffset[elt]);
deformable_col2im(
columns,
offset[elt],
nInputPlane,
inputHeight,
inputWidth,
kH,
kW,
padH,
padW,
dH,
dW,
dilationH,
dilationW,
im2col_step,
deformable_group,
gradInput[elt]);
}
gradOutput.transpose_(1, 2);
gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
gradOffset = gradOffset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
gradOffset =
gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
}
return 1;
}
int deform_conv_backward_parameters_cuda(
at::Tensor input,
at::Tensor offset,
at::Tensor gradOutput,
at::Tensor gradWeight, // at::Tensor gradBias,
at::Tensor columns,
at::Tensor ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH,
int group,
int deformable_group,
float scale,
int im2col_step) {
// todo: transpose and reshape outGrad
// todo: reshape columns
// todo: add im2col_step as input
shape_check(
input,
offset,
&gradOutput,
gradWeight,
kH,
kW,
dH,
dW,
padH,
padW,
dilationH,
dilationW,
group,
deformable_group);
input = input.contiguous();
offset = offset.contiguous();
gradOutput = gradOutput.contiguous();
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input = input.view(
at::IntList({1, input.size(0), input.size(1), input.size(2)}));
gradOutput = gradOutput.view(
{1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
}
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = gradWeight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
gradOutput = gradOutput.view(
{batchSize / im2col_step,
im2col_step,
nOutputPlane,
outputHeight,
outputWidth});
gradOutput.transpose_(1, 2);
at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
gradOutputBuffer = gradOutputBuffer.view(
{batchSize / im2col_step,
nOutputPlane,
im2col_step,
outputHeight,
outputWidth});
gradOutputBuffer.copy_(gradOutput);
// gradOutput is not contiguous, so we do reshape (instead of view) next
gradOutputBuffer = gradOutputBuffer.reshape(
{batchSize / im2col_step,
nOutputPlane,
im2col_step * outputHeight,
outputWidth});
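// gradOutputBuffer now has shape [chunks, nOutputPlane, im2col_step * outH, outW],
// so flattening it per group lines up with the `columns` layout in the GEMM below.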
gradOutput.transpose_(1, 2);
gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view(
{batchSize / im2col_step,
im2col_step,
nInputPlane,
inputHeight,
inputWidth});
offset = offset.view(
{batchSize / im2col_step,
im2col_step,
deformable_group * 2 * kH * kW,
outputHeight,
outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(
input[elt],
offset[elt],
nInputPlane,
inputHeight,
inputWidth,
kH,
kW,
padH,
padW,
dH,
dW,
dilationH,
dilationW,
im2col_step,
deformable_group,
columns);
// divide into groups
gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.size(0),
group,
gradOutputBuffer.size(1) / group,
gradOutputBuffer.size(2),
gradOutputBuffer.size(3)});
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
gradWeight = gradWeight.view(
{group,
gradWeight.size(0) / group,
gradWeight.size(1),
gradWeight.size(2),
gradWeight.size(3)});
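// gradWeight[g] += scale * gradOutputBuffer[elt][g] (flattened) x columns[g]^T,
// accumulated across chunks (beta = 1 keeps the previous contents).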
for (int g = 0; g < group; g++) {
gradWeight[g] = gradWeight[g]
.flatten(1)
.addmm_(
gradOutputBuffer[elt][g].flatten(1),
columns[g].transpose(1, 0),
1.0,
scale)
.view_as(gradWeight[g]);
}
gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.size(0),
gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
gradOutputBuffer.size(3),
gradOutputBuffer.size(4)});
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradWeight = gradWeight.view(
{gradWeight.size(0) * gradWeight.size(1),
gradWeight.size(2),
gradWeight.size(3),
gradWeight.size(4)});
}
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
}
return 1;
}
void modulated_deform_conv_cuda_forward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor output,
at::Tensor columns,
int kernel_h,
int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const int group,
const int deformable_group,
const bool with_bias) {
shape_check(
input,
offset,
NULL,
weight,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
group,
deformable_group);
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_out = weight.size(0);
const int channels_kernel = weight.size(1);
const int kernel_h_ = weight.size(2);
const int kernel_w_ = weight.size(3);
if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
AT_ERROR(
"Input shape and kernel shape won't match: (%d x %d vs %d x %d).",
kernel_h,
kernel_w,
kernel_h_,
kernel_w_);
if (channels != channels_kernel * group)
AT_ERROR(
"Input shape and kernel channels won't match: (%d vs %d).",
channels,
channels_kernel * group);
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
// mask shape check
TORCH_CHECK(
(mask.size(2) == height_out && mask.size(3) == width_out),
"invalid spatial size of mask, expected height: %d width: %d, but "
"got height: %d width: %d",
height_out,
width_out,
mask.size(2),
mask.size(3));
TORCH_CHECK(
(mask.size(1) == deformable_group * kernel_h * kernel_w),
"invalid number of channels of mask");
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = at::ones({height_out, width_out}, input.options());
}
// resize output
output = output.view({batch, channels_out, height_out, width_out}).zero_();
// resize temporary columns
columns = at::zeros(
{channels * kernel_h * kernel_w, 1 * height_out * width_out},
input.options());
output = output.view(
{output.size(0),
group,
output.size(1) / group,
output.size(2),
output.size(3)});
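// The modulated (DCNv2) path processes one image per iteration: unfold it with
// the offsets and modulation mask applied, then do a grouped matrix multiply.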
for (int b = 0; b < batch; b++) {
modulated_deformable_im2col_cuda(
input[b],
offset[b],
mask[b],
1,
channels,
height,
width,
height_out,
width_out,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
deformable_group,
columns);
// divide into groups
weight = weight.view(
{group,
weight.size(0) / group,
weight.size(1),
weight.size(2),
weight.size(3)});
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
for (int g = 0; g < group; g++) {
output[b][g] = output[b][g]
.flatten(1)
.addmm_(weight[g].flatten(1), columns[g])
.view_as(output[b][g]);
}
weight = weight.view(
{weight.size(0) * weight.size(1),
weight.size(2),
weight.size(3),
weight.size(4)});
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
}
output = output.view(
{output.size(0),
output.size(1) * output.size(2),
output.size(3),
output.size(4)});
if (with_bias) {
output += bias.view({1, bias.size(0), 1, 1});
}
}
void modulated_deform_conv_cuda_backward(
at::Tensor input,
at::Tensor weight,
at::Tensor bias,
at::Tensor ones,
at::Tensor offset,
at::Tensor mask,
at::Tensor columns,
at::Tensor grad_input,
at::Tensor grad_weight,
at::Tensor grad_bias,
at::Tensor grad_offset,
at::Tensor grad_mask,
at::Tensor grad_output,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int pad_h,
int pad_w,
int dilation_h,
int dilation_w,
int group,
int deformable_group,
const bool with_bias) {
shape_check(
input,
offset,
&grad_output,
weight,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
group,
deformable_group);
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_kernel = weight.size(1);
const int kernel_h_ = weight.size(2);
const int kernel_w_ = weight.size(3);
if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
AT_ERROR(
"Input shape and kernel shape won't match: (%d x %d vs %d x %d).",
kernel_h,
kernel_w,
kernel_h_,
kernel_w_);
if (channels != channels_kernel * group)
AT_ERROR(
"Input shape and kernel channels won't match: (%d vs %d).",
channels,
channels_kernel * group);
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
// mask shape check
TORCH_CHECK(
(mask.size(2) == height_out && mask.size(3) == width_out),
"invalid spatial size of mask, expected height: %d width: %d, but "
"got height: %d width: %d",
height_out,
width_out,
mask.size(2),
mask.size(3));
TORCH_CHECK(
(mask.size(1) == deformable_group * kernel_h * kernel_w),
"invalid number of channels of mask");
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = at::ones({height_out, width_out}, input.options());
}
grad_input = grad_input.view({batch, channels, height, width});
columns = at::zeros(
{channels * kernel_h * kernel_w, height_out * width_out},
input.options());
grad_output = grad_output.view(
{grad_output.size(0),
group,
grad_output.size(1) / group,
grad_output.size(2),
grad_output.size(3)});
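// Per image: (1) columns = weight^T x grad_output gives the column gradient,
// (2) col2im_coord / col2im scatter it to grad_offset, grad_mask and grad_input,
// (3) a fresh im2col provides the columns for the weight (and bias) gradient.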
for (int b = 0; b < batch; b++) {
// divide into groups
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view(
{group,
weight.size(0) / group,
weight.size(1),
weight.size(2),
weight.size(3)});
for (int g = 0; g < group; g++) {
columns[g].addmm_(
weight[g].flatten(1).transpose(0, 1),
grad_output[b][g].flatten(1),
0.0f,
1.0f);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
weight = weight.view(
{weight.size(0) * weight.size(1),
weight.size(2),
weight.size(3),
weight.size(4)});
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda(
columns,
input[b],
offset[b],
mask[b],
1,
channels,
height,
width,
height_out,
width_out,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
deformable_group,
grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cuda(
columns,
offset[b],
mask[b],
1,
channels,
height,
width,
height_out,
width_out,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
deformable_group,
grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cuda(
input[b],
offset[b],
mask[b],
1,
channels,
height,
width,
height_out,
width_out,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
deformable_group,
columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
grad_weight = grad_weight.view(
{group,
grad_weight.size(0) / group,
grad_weight.size(1),
grad_weight.size(2),
grad_weight.size(3)});
if (with_bias)
grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
for (int g = 0; g < group; g++) {
grad_weight[g] =
grad_weight[g]
.flatten(1)
.addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
.view_as(grad_weight[g]);
if (with_bias) {
grad_bias[g] =
grad_bias[g]
.view({-1, 1})
.addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
.view(-1);
}
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
grad_weight = grad_weight.view(
{grad_weight.size(0) * grad_weight.size(1),
grad_weight.size(2),
grad_weight.size(3),
grad_weight.size(4)});
if (with_bias)
grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
}
grad_output = grad_output.view(
{grad_output.size(0) * grad_output.size(1),
grad_output.size(2),
grad_output.size(3),
grad_output.size(4)});
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
// modified from
// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
// Original license: Apache 2.0
// clang-format off
// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer *****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
*FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer *********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
#include <ATen/ATen.h>
#include <c10/cuda/CUDAGuard.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <THC/THCAtomics.cuh>
using namespace at;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
namespace {
const int CUDA_NUM_THREADS = 1024;
const int kMaxGridNum = 65535;
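// Cap the grid size at kMaxGridNum; CUDA_KERNEL_LOOP is a grid-stride loop,
// so all N elements are still covered even when the grid is clamped.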
inline int GET_BLOCKS(const int N) {
return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
}
}
template <typename scalar_t>
__device__ scalar_t deformable_im2col_bilinear(
const scalar_t* bottom_data,
const int data_width,
const int height,
const int width,
scalar_t h,
scalar_t w) {
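// Bilinear interpolation at the fractional location (h, w); corners that fall
// outside the image contribute zero.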
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
scalar_t lh = h - h_low;
scalar_t lw = w - w_low;
scalar_t hh = 1 - lh, hw = 1 - lw;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
v1 = bottom_data[h_low * data_width + w_low];
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = bottom_data[h_low * data_width + w_high];
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = bottom_data[h_high * data_width + w_low];
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = bottom_data[h_high * data_width + w_high];
scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename scalar_t>
__device__ scalar_t get_gradient_weight(
scalar_t argmax_h,
scalar_t argmax_w,
const int h,
const int w,
const int height,
const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename scalar_t>
__device__ scalar_t get_coordinate_weight(
scalar_t argmax_h,
scalar_t argmax_w,
const int height,
const int width,
const scalar_t* im_data,
const int data_width,
const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template <typename scalar_t>
__global__ void deformable_im2col_gpu_kernel(
const int n,
const scalar_t* data_im,
const scalar_t* data_offset,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int num_channels,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* data_col) {
CUDA_KERNEL_LOOP(index, n) {
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
scalar_t* data_col_ptr = data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
// const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
// height + h_in) * width + w_in;
const scalar_t* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const scalar_t* data_offset_ptr = data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
scalar_t val = static_cast<scalar_t>(0);
const scalar_t h_im = h_in + i * dilation_h + offset_h;
const scalar_t w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
// const scalar_t map_h = i * dilation_h + offset_h;
// const scalar_t map_w = j * dilation_w + offset_w;
// const int cur_height = height - h_in;
// const int cur_width = width - w_in;
// val = deformable_im2col_bilinear(data_im_ptr, width, cur_height,
// cur_width, map_h, map_w);
val = deformable_im2col_bilinear(
data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename scalar_t>
__global__ void deformable_col2im_gpu_kernel(
const int n,
const scalar_t* data_col,
const scalar_t* data_offset,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* grad_im) {
CUDA_KERNEL_LOOP(index, n) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const scalar_t* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
const scalar_t cur_top_grad = data_col[index];
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
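// Scatter this column gradient onto the integer pixels within unit distance of
// the fractional sampling location (its bilinear footprint). atomicAdd is used
// because several columns may write to the same input pixel.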
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
scalar_t weight = get_gradient_weight(
cur_inv_h_data,
cur_inv_w_data,
cur_h + dy,
cur_w + dx,
height,
width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
template <typename scalar_t>
__global__ void deformable_col2im_coord_gpu_kernel(
const int n,
const scalar_t* data_col,
const scalar_t* data_im,
const scalar_t* data_offset,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int offset_channels,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* grad_offset) {
CUDA_KERNEL_LOOP(index, n) {
scalar_t val = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const scalar_t* data_col_ptr = data_col +
deformable_group_index * channel_per_deformable_group * batch_size *
width_col * height_col;
const scalar_t* data_im_ptr = data_im +
(b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w * height * width;
const scalar_t* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
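// Each offset channel (an h- or w-offset of one kernel position in this
// deformable group) is used by that kernel position across all input channels
// of the group, so accumulate d(column)/d(offset) over those columns.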
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
scalar_t inv_h = h_in + i * dilation_h + offset_h;
scalar_t inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2;
}
const scalar_t weight = get_coordinate_weight(
inv_h,
inv_w,
height,
width,
data_im_ptr + cnt * height * width,
width,
bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[index] = val;
}
}
namespace detectron2 {
void deformable_im2col(
const at::Tensor data_im,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor data_col) {
// num_axes should be smaller than block size
// todo: check parallel_imgs is correctly passed in
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
at::cuda::CUDAGuard device_guard(data_im.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
deformable_im2col_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_im_,
data_offset_,
height,
width,
ksize_h,
ksize_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
parallel_imgs,
channels,
deformable_group,
height_col,
width_col,
data_col_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in deformable_im2col: %s\n", cudaGetErrorString(err));
}
}
void deformable_col2im(
const at::Tensor data_col,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor grad_im) {
// todo: make sure parallel_imgs is passed in correctly
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels =
channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
at::cuda::CUDAGuard device_guard(data_col.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t* grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_col_,
data_offset_,
channels,
height,
width,
ksize_h,
ksize_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
parallel_imgs,
deformable_group,
height_col,
width_col,
grad_im_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in deformable_col2im: %s\n", cudaGetErrorString(err));
}
}
void deformable_col2im_coord(
const at::Tensor data_col,
const at::Tensor data_im,
const at::Tensor data_offset,
const int channels,
const int height,
const int width,
const int ksize_h,
const int ksize_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int parallel_imgs,
const int deformable_group,
at::Tensor grad_offset) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
deformable_group * parallel_imgs;
int channel_per_deformable_group =
channels * ksize_h * ksize_w / deformable_group;
at::cuda::CUDAGuard device_guard(data_col.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t* grad_offset_ = grad_offset.data_ptr<scalar_t>();
deformable_col2im_coord_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_col_,
data_im_,
data_offset_,
channels,
height,
width,
ksize_h,
ksize_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
parallel_imgs,
2 * ksize_h * ksize_w * deformable_group,
deformable_group,
height_col,
width_col,
grad_offset_);
}));
}
} // namespace detectron2
template <typename scalar_t>
__device__ scalar_t dmcn_im2col_bilinear(
const scalar_t* bottom_data,
const int data_width,
const int height,
const int width,
scalar_t h,
scalar_t w) {
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
scalar_t lh = h - h_low;
scalar_t lw = w - w_low;
scalar_t hh = 1 - lh, hw = 1 - lw;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
v1 = bottom_data[h_low * data_width + w_low];
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = bottom_data[h_low * data_width + w_high];
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = bottom_data[h_high * data_width + w_low];
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = bottom_data[h_high * data_width + w_high];
scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename scalar_t>
__device__ scalar_t dmcn_get_gradient_weight(
scalar_t argmax_h,
scalar_t argmax_w,
const int h,
const int w,
const int height,
const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename scalar_t>
__device__ scalar_t dmcn_get_coordinate_weight(
scalar_t argmax_h,
scalar_t argmax_w,
const int height,
const int width,
const scalar_t* im_data,
const int data_width,
const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template <typename scalar_t>
__global__ void modulated_deformable_im2col_gpu_kernel(
const int n,
const scalar_t* data_im,
const scalar_t* data_offset,
const scalar_t* data_mask,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int num_channels,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* data_col) {
CUDA_KERNEL_LOOP(index, n) {
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
scalar_t* data_col_ptr = data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
// const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
// height + h_in) * width + w_in;
const scalar_t* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const scalar_t* data_offset_ptr = data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const scalar_t* data_mask_ptr = data_mask +
(b_col * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
scalar_t val = static_cast<scalar_t>(0);
const scalar_t h_im = h_in + i * dilation_h + offset_h;
const scalar_t w_im = w_in + j * dilation_w + offset_w;
// if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
// const float map_h = i * dilation_h + offset_h;
// const float map_w = j * dilation_w + offset_w;
// const int cur_height = height - h_in;
// const int cur_width = width - w_in;
// val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height,
// cur_width, map_h, map_w);
val = dmcn_im2col_bilinear(
data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
// data_col_ptr += height_col * width_col;
}
}
}
}
template <typename scalar_t>
__global__ void modulated_deformable_col2im_gpu_kernel(
const int n,
const scalar_t* data_col,
const scalar_t* data_offset,
const scalar_t* data_mask,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* grad_im) {
CUDA_KERNEL_LOOP(index, n) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const scalar_t* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const scalar_t* data_mask_ptr = data_mask +
(b * deformable_group + deformable_group_index) * kernel_h * kernel_w *
height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
const scalar_t cur_top_grad = data_col[index] * mask;
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
scalar_t weight = dmcn_get_gradient_weight(
cur_inv_h_data,
cur_inv_w_data,
cur_h + dy,
cur_w + dx,
height,
width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
template <typename scalar_t>
__global__ void modulated_deformable_col2im_coord_gpu_kernel(
const int n,
const scalar_t* data_col,
const scalar_t* data_im,
const scalar_t* data_offset,
const scalar_t* data_mask,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int offset_channels,
const int deformable_group,
const int height_col,
const int width_col,
scalar_t* grad_offset,
scalar_t* grad_mask) {
CUDA_KERNEL_LOOP(index, n) {
scalar_t val = 0, mval = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const scalar_t* data_col_ptr = data_col +
deformable_group_index * channel_per_deformable_group * batch_size *
width_col * height_col;
const scalar_t* data_im_ptr = data_im +
(b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w * height * width;
const scalar_t* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const scalar_t* data_mask_ptr = data_mask +
(b * deformable_group + deformable_group_index) * kernel_h * kernel_w *
height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
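// val accumulates the gradient w.r.t. this offset channel; mval accumulates
// the gradient w.r.t. the corresponding modulation mask entry (the forward
// pass multiplies the bilinearly sampled value by the mask).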
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
scalar_t inv_h = h_in + i * dilation_h + offset_h;
scalar_t inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2;
} else {
mval += data_col_ptr[col_pos] *
dmcn_im2col_bilinear(
data_im_ptr + cnt * height * width,
width,
height,
width,
inv_h,
inv_w);
}
const scalar_t weight = dmcn_get_coordinate_weight(
inv_h,
inv_w,
height,
width,
data_im_ptr + cnt * height * width,
width,
bp_dir);
val += weight * data_col_ptr[col_pos] * mask;
cnt += 1;
}
// KERNEL_ASSIGN(grad_offset[index], offset_req, val);
grad_offset[index] = val;
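// The mask gradient is written once per (h, w) offset pair, by the thread that
// handles the even (h-offset) channel.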
if (offset_c % 2 == 0)
// KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
// deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
// height_col + h) * width_col + w], mask_req, mval);
grad_mask
[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w +
offset_c / 2) *
height_col +
h) *
width_col +
w] = mval;
}
}
namespace detectron2 {
void modulated_deformable_im2col_cuda(
const at::Tensor data_im,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor data_col) {
// num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
at::cuda::CUDAGuard device_guard(data_im.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
modulated_deformable_im2col_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_im_,
data_offset_,
data_mask_,
height_im,
width_im,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
batch_size,
channels,
deformable_group,
height_col,
width_col,
data_col_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf(
"error in modulated_deformable_im2col_cuda: %s\n",
cudaGetErrorString(err));
}
}
void modulated_deformable_col2im_cuda(
const at::Tensor data_col,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor grad_im) {
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels =
channels * kernel_h * kernel_w * batch_size * height_col * width_col;
at::cuda::CUDAGuard device_guard(data_col.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t* grad_im_ = grad_im.data_ptr<scalar_t>();
modulated_deformable_col2im_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_col_,
data_offset_,
data_mask_,
channels,
height_im,
width_im,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
batch_size,
deformable_group,
height_col,
width_col,
grad_im_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf(
"error in modulated_deformable_col2im_cuda: %s\n",
cudaGetErrorString(err));
}
}
void modulated_deformable_col2im_coord_cuda(
const at::Tensor data_col,
const at::Tensor data_im,
const at::Tensor data_offset,
const at::Tensor data_mask,
const int batch_size,
const int channels,
const int height_im,
const int width_im,
const int height_col,
const int width_col,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int deformable_group,
at::Tensor grad_offset,
at::Tensor grad_mask) {
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
kernel_w * deformable_group;
const int channel_per_deformable_group =
channels * kernel_h * kernel_w / deformable_group;
at::cuda::CUDAGuard device_guard(data_col.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t* grad_offset_ = grad_offset.data_ptr<scalar_t>();
scalar_t* grad_mask_ = grad_mask.data_ptr<scalar_t>();
modulated_deformable_col2im_coord_gpu_kernel<<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS,
0,
stream>>>(
num_kernels,
data_col_,
data_im_,
data_offset_,
data_mask_,
channels,
height_im,
width_im,
kernel_h,
kernel_w,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
channel_per_deformable_group,
batch_size,
2 * kernel_h * kernel_w * deformable_group,
deformable_group,
height_col,
width_col,
grad_offset_,
grad_mask_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf(
"error in modulated_deformable_col2im_coord_cuda: %s\n",
cudaGetErrorString(err));
}
}
} // namespace detectron2