Commit 6f3c5f1c authored by limm

support v1.4.0

parent 6f674c7e
......@@ -2,8 +2,6 @@
# modified from
# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py
from typing import Tuple
import torch
import torch.nn as nn
from torch.autograd import Function
......@@ -23,8 +21,7 @@ class BorderAlignFunction(Function):
'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)
@staticmethod
def forward(ctx, input: torch.Tensor, boxes: torch.Tensor,
pool_size: int) -> torch.Tensor:
def forward(ctx, input, boxes, pool_size):
ctx.pool_size = pool_size
ctx.input_shape = input.size()
......@@ -48,8 +45,7 @@ class BorderAlignFunction(Function):
@staticmethod
@once_differentiable
def backward(ctx,
grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
def backward(ctx, grad_output):
boxes, argmax_idx = ctx.saved_tensors
grad_input = grad_output.new_zeros(ctx.input_shape)
# complex head architecture may cause grad_output uncontiguous
......@@ -76,25 +72,24 @@ class BorderAlign(nn.Module):
For each border line (e.g. top, left, bottom or right) of each box,
border_align does the following:
1. uniformly samples ``pool_size`` +1 positions on this line, involving
1. uniformly samples `pool_size`+1 positions on this line, involving \
the start and end points.
2. the corresponding features on these points are computed by bilinear
interpolation.
3. max pooling over all the ``pool_size`` +1 positions is used for
2. the corresponding features on these points are computed by \
bilinear interpolation.
3. max pooling over all the `pool_size`+1 positions is used for \
computing pooled feature.
Args:
pool_size (int): number of positions sampled over the boxes' borders
(e.g. top, bottom, left, right).
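Example:
    >>> # Minimal usage sketch; shapes are illustrative and a CUDA build
    >>> # of the extension op is assumed.
    >>> input = torch.rand(2, 16, 8, 8).cuda()  # [N, 4C, H, W] with C=4
    >>> boxes = torch.rand(2, 64, 4).cuda()     # [N, H*W, 4], (x1, y1, x2, y2)
    >>> border_align_module = BorderAlign(pool_size=10)
    >>> output = border_align_module(input, boxes)  # [N, C, H*W, 4]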
"""
def __init__(self, pool_size: int):
super().__init__()
def __init__(self, pool_size):
super(BorderAlign, self).__init__()
self.pool_size = pool_size
def forward(self, input: torch.Tensor,
boxes: torch.Tensor) -> torch.Tensor:
def forward(self, input, boxes):
"""
Args:
input: Features with shape [N,4C,H,W]. Channels ranged in [0,C),
......@@ -103,7 +98,7 @@ class BorderAlign(nn.Module):
boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).
Returns:
torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is
Tensor: Pooled features with shape [N,C,H*W,4]. The order is
(top,left,bottom,right) for the last dimension.
"""
return border_align(input, boxes, self.pool_size)
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['box_iou_quadri'])
def box_iou_quadri(bboxes1: torch.Tensor,
bboxes2: torch.Tensor,
mode: str = 'iou',
aligned: bool = False) -> torch.Tensor:
"""Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in
(x1, y1, ..., x4, y4) format.
If ``aligned`` is ``False``, then calculate the ious between each bbox
of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
bboxes1 and bboxes2.
Args:
bboxes1 (torch.Tensor): quadrilateral bboxes 1. It has shape (N, 8),
indicating (x1, y1, ..., x4, y4) for each row.
bboxes2 (torch.Tensor): quadrilateral bboxes 2. It has shape (M, 8),
indicating (x1, y1, ..., x4, y4) for each row.
mode (str): "iou" (intersection over union) or "iof" (intersection over
foreground).
Returns:
torch.Tensor: Return the ious between boxes. If ``aligned`` is
``False``, the shape of ious is (N, M) else (N,).
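Example:
    >>> # Illustrative sketch; each row holds the 8 corner coordinates of a
    >>> # quadrilateral, and the compiled '_ext' op is assumed to be
    >>> # available (move the tensors to GPU for a CUDA-only build).
    >>> bboxes1 = torch.rand(3, 8)
    >>> bboxes2 = torch.rand(4, 8)
    >>> ious = box_iou_quadri(bboxes1, bboxes2)  # shape (3, 4)
    >>> ious_aligned = box_iou_quadri(bboxes1, bboxes1, aligned=True)  # shape (3,)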
"""
assert mode in ['iou', 'iof']
mode_dict = {'iou': 0, 'iof': 1}
mode_flag = mode_dict[mode]
rows = bboxes1.size(0)
cols = bboxes2.size(0)
if aligned:
ious = bboxes1.new_zeros(rows)
else:
ious = bboxes1.new_zeros(rows * cols)
bboxes1 = bboxes1.contiguous()
bboxes2 = bboxes2.contiguous()
ext_module.box_iou_quadri(
bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned)
if not aligned:
ious = ious.view(rows, cols)
return ious
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])
def box_iou_rotated(bboxes1: torch.Tensor,
bboxes2: torch.Tensor,
mode: str = 'iou',
aligned: bool = False,
clockwise: bool = True) -> torch.Tensor:
def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
"""Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in
......@@ -20,110 +14,18 @@ def box_iou_rotated(bboxes1: torch.Tensor,
of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
bboxes1 and bboxes2.
.. note::
The operator assumes:
1) The positive direction along x axis is left -> right.
2) The positive direction along y axis is top -> down.
3) The w border is in parallel with x axis when angle = 0.
However, there are 2 opposite definitions of the positive angular
direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports
both definitions and uses CW by default.
Please set ``clockwise=False`` if you are using the CCW definition.
The coordinate system when ``clockwise`` is ``True`` (default)
.. code-block:: none
0-------------------> x (0 rad)
| A-------------B
| | |
| | box h
| | angle=0 |
| D------w------C
v
y (pi/2 rad)
In such a coordinate system the rotation matrix is
.. math::
\\begin{pmatrix}
\\cos\\alpha & -\\sin\\alpha \\\\
\\sin\\alpha & \\cos\\alpha
\\end{pmatrix}
The coordinates of the corner point A can be calculated as:
.. math::
P_A=
\\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
=
\\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
\\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\
\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
\\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
=
\\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha
\\\\
y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
The coordinate system when ``clockwise`` is ``False``
.. code-block:: none
0-------------------> x (0 rad)
| A-------------B
| | |
| | box h
| | angle=0 |
| D------w------C
v
y (-pi/2 rad)
In such a coordinate system the rotation matrix is
.. math::
\\begin{pmatrix}
\\cos\\alpha & \\sin\\alpha \\\\
-\\sin\\alpha & \\cos\\alpha
\\end{pmatrix}
The coordinates of the corner point A can be calculated as:
.. math::
P_A=
\\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
=
\\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
\\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\
-\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
\\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
=
\\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha
\\\\
y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
Args:
boxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5),
indicating (x, y, w, h, theta) for each row. Note that theta is in
radian.
boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5),
indicating (x, y, w, h, theta) for each row. Note that theta is in
radian.
Arguments:
boxes1 (Tensor): rotated bboxes 1. \
It has shape (N, 5), indicating (x, y, w, h, theta) for each row.
Note that theta is in radian.
boxes2 (Tensor): rotated bboxes 2. \
It has shape (M, 5), indicating (x, y, w, h, theta) for each row.
Note that theta is in radian.
mode (str): "iou" (intersection over union) or "iof" (intersection over
foreground).
clockwise (bool): flag indicating whether the positive angular
orientation is clockwise. default True.
`New in version 1.4.3.`
Returns:
torch.Tensor: Return the ious between boxes. If ``aligned`` is
``False``, the shape of ious is (N, M) else (N,).
ious(Tensor): shape (N, M) if aligned == False else shape (N,)
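Example:
    >>> # Illustrative sketch; boxes are (x, y, w, h, theta) with theta in
    >>> # radian, and the compiled '_ext' op is assumed to be available.
    >>> bboxes1 = torch.tensor([[10., 10., 4., 2., 0.]])
    >>> bboxes2 = torch.tensor([[10., 10., 4., 2., 0.5]])
    >>> ious = box_iou_rotated(bboxes1, bboxes2)  # shape (1, 1)
    >>> ious_ccw = box_iou_rotated(bboxes1, bboxes2, clockwise=False)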
"""
assert mode in ['iou', 'iof']
mode_dict = {'iou': 0, 'iof': 1}
......@@ -133,12 +35,7 @@ def box_iou_rotated(bboxes1: torch.Tensor,
if aligned:
ious = bboxes1.new_zeros(rows)
else:
ious = bboxes1.new_zeros(rows * cols)
if not clockwise:
flip_mat = bboxes1.new_ones(bboxes1.shape[-1])
flip_mat[-1] = -1
bboxes1 = bboxes1 * flip_mat
bboxes2 = bboxes2 * flip_mat
ious = bboxes1.new_zeros((rows * cols))
bboxes1 = bboxes1.contiguous()
bboxes2 = bboxes2.contiguous()
ext_module.box_iou_rotated(
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.model import normal_init, xavier_init
from mmengine.registry import MODELS
from torch import Tensor
from torch.autograd import Function
from torch.nn.modules.module import Module
from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
......@@ -21,8 +17,7 @@ ext_module = ext_loader.load_ext('_ext', [
class CARAFENaiveFunction(Function):
@staticmethod
def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
return g.op(
'mmcv::MMCVCARAFENaive',
features,
......@@ -32,8 +27,7 @@ class CARAFENaiveFunction(Function):
scale_factor_f=scale_factor)
@staticmethod
def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
......@@ -56,15 +50,12 @@ class CARAFENaiveFunction(Function):
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad or \
torch.__version__ == 'parrots':
if features.requires_grad or masks.requires_grad:
ctx.save_for_backward(features, masks)
return output
@staticmethod
def backward(
ctx,
grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
def backward(ctx, grad_output):
assert grad_output.is_cuda
features, masks = ctx.saved_tensors
......@@ -92,8 +83,8 @@ carafe_naive = CARAFENaiveFunction.apply
class CARAFENaive(Module):
def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
super().__init__()
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFENaive, self).__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
......@@ -101,7 +92,7 @@ class CARAFENaive(Module):
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features: Tensor, masks: Tensor) -> Tensor:
def forward(self, features, masks):
return carafe_naive(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
......@@ -109,8 +100,7 @@ class CARAFENaive(Module):
class CARAFEFunction(Function):
@staticmethod
def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
return g.op(
'mmcv::MMCVCARAFE',
features,
......@@ -120,8 +110,7 @@ class CARAFEFunction(Function):
scale_factor_f=scale_factor)
@staticmethod
def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
......@@ -150,15 +139,14 @@ class CARAFEFunction(Function):
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad or \
torch.__version__ == 'parrots':
if features.requires_grad or masks.requires_grad:
ctx.save_for_backward(features, masks, rfeatures)
return output
@staticmethod
def backward(
ctx,
grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
def backward(ctx, grad_output):
assert grad_output.is_cuda
features, masks, rfeatures = ctx.saved_tensors
kernel_size = ctx.kernel_size
group_size = ctx.group_size
......@@ -192,8 +180,7 @@ carafe = CARAFEFunction.apply
class CARAFE(Module):
""" CARAFE: Content-Aware ReAssembly of FEatures
Please refer to `CARAFE: Content-Aware ReAssembly of FEatures
<https://arxiv.org/abs/1905.02188>`_ for more details.
Please refer to https://arxiv.org/abs/1905.02188 for more details.
Args:
kernel_size (int): reassemble kernel size
......@@ -204,8 +191,8 @@ class CARAFE(Module):
upsampled feature map
"""
def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
super().__init__()
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFE, self).__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
......@@ -213,19 +200,19 @@ class CARAFE(Module):
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features: Tensor, masks: Tensor) -> Tensor:
def forward(self, features, masks):
return carafe(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
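# A minimal usage sketch of ``CARAFE`` (illustrative shapes; a CUDA build of
# the extension op is assumed). With kernel_size=5 and group_size=1, ``masks``
# needs kernel_size**2 * group_size channels and the spatial size of the
# upsampled output:
#   features = torch.rand(2, 64, 16, 16).cuda()
#   masks = torch.rand(2, 25, 32, 32).cuda()
#   upsampled = CARAFE(kernel_size=5, group_size=1, scale_factor=2)(features, masks)
#   # -> (2, 64, 32, 32)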
@MODELS.register_module(name='carafe')
@UPSAMPLE_LAYERS.register_module(name='carafe')
class CARAFEPack(nn.Module):
"""A unified package of CARAFE upsampler that contains: 1) channel
compressor 2) content encoder 3) CARAFE op.
Official implementation of ICCV 2019 paper
`CARAFE: Content-Aware ReAssembly of FEatures
<https://arxiv.org/abs/1905.02188>`_.
CARAFE: Content-Aware ReAssembly of FEatures
Please refer to https://arxiv.org/abs/1905.02188 for more details.
Args:
channels (int): input feature channels
......@@ -241,14 +228,14 @@ class CARAFEPack(nn.Module):
"""
def __init__(self,
channels: int,
scale_factor: int,
up_kernel: int = 5,
up_group: int = 1,
encoder_kernel: int = 3,
encoder_dilation: int = 1,
compressed_channels: int = 64):
super().__init__()
channels,
scale_factor,
up_kernel=5,
up_group=1,
encoder_kernel=3,
encoder_dilation=1,
compressed_channels=64):
super(CARAFEPack, self).__init__()
self.channels = channels
self.scale_factor = scale_factor
self.up_kernel = up_kernel
......@@ -274,7 +261,7 @@ class CARAFEPack(nn.Module):
xavier_init(m, distribution='uniform')
normal_init(self.content_encoder, std=0.001)
def kernel_normalizer(self, mask: Tensor) -> Tensor:
def kernel_normalizer(self, mask):
mask = F.pixel_shuffle(mask, self.scale_factor)
n, mask_c, h, w = mask.size()
# use float division explicitly,
......@@ -287,11 +274,11 @@ class CARAFEPack(nn.Module):
return mask
def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor:
def feature_reassemble(self, x, mask):
x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
return x
def forward(self, x: Tensor) -> Tensor:
def forward(self, x):
compressed_x = self.channel_compressor(x)
mask = self.content_encoder(compressed_x)
mask = self.kernel_normalizer(mask)
......
......@@ -2,12 +2,11 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.registry import MODELS
from mmcv.cnn import Scale
from mmcv.cnn import PLUGIN_LAYERS, Scale
def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor:
def NEG_INF_DIAG(n, device):
"""Returns a diagonal matrix of size [n, n].
The diagonal elements are all "-inf". This is for avoiding calculating the
......@@ -16,7 +15,7 @@ def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor:
return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0)
@MODELS.register_module()
@PLUGIN_LAYERS.register_module()
class CrissCrossAttention(nn.Module):
"""Criss-Cross Attention Module.
......@@ -42,7 +41,7 @@ class CrissCrossAttention(nn.Module):
in_channels (int): Channels of the input feature map.
"""
def __init__(self, in_channels: int) -> None:
def __init__(self, in_channels):
super().__init__()
self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
......@@ -50,15 +49,14 @@ class CrissCrossAttention(nn.Module):
self.gamma = Scale(0.)
self.in_channels = in_channels
def forward(self, x: torch.Tensor) -> torch.Tensor:
def forward(self, x):
"""forward function of Criss-Cross Attention.
Args:
x (torch.Tensor): Input feature with the shape of
(batch_size, in_channels, height, width).
x (Tensor): Input feature. \
shape (batch_size, in_channels, height, width)
Returns:
torch.Tensor: Output of the layer, with the shape of
Tensor: Output of the layer, with shape of \
(batch_size, in_channels, height, width)
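Example:
    >>> # Illustrative sketch; the module is built from standard PyTorch
    >>> # ops, so this should also run on CPU.
    >>> cca = CrissCrossAttention(in_channels=32)
    >>> x = torch.rand(1, 32, 45, 45)
    >>> out = cca(x)  # (1, 32, 45, 45), same shape as the input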
"""
B, C, H, W = x.size()
......@@ -79,7 +77,7 @@ class CrissCrossAttention(nn.Module):
return out
def __repr__(self) -> str:
def __repr__(self):
s = self.__class__.__name__
s += f'(in_channels={self.in_channels})'
return s
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Sequence, Tuple
import torch
from torch import Tensor
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['chamfer_distance_forward', 'chamfer_distance_backward'])
class ChamferDistanceFunction(Function):
"""This is an implementation of the 2D Chamfer Distance.
It has been used in the paper `Oriented RepPoints for Aerial Object
Detection (CVPR 2022) <https://arxiv.org/abs/2105.11111>`_.
"""
@staticmethod
def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]:
"""
Args:
xyz1 (Tensor): Point set with shape (B, N, 2).
xyz2 (Tensor): Point set with shape (B, M, 2).
Returns:
Sequence[Tensor]:
- dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with
shape (B, N).
- dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with
shape (B, M).
- idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2)
with shape (B, N), which is used to compute the gradient.
- idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1)
with shape (B, M), which is used to compute the gradient.
"""
batch_size, n, _ = xyz1.size()
_, m, _ = xyz2.size()
device = xyz1.device
xyz1 = xyz1.contiguous()
xyz2 = xyz2.contiguous()
dist1 = torch.zeros(batch_size, n).to(device)
dist2 = torch.zeros(batch_size, m).to(device)
idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device)
idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device)
ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1,
idx2)
ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
return dist1, dist2, idx1, idx2
@staticmethod
@once_differentiable
def backward(ctx,
grad_dist1: Tensor,
grad_dist2: Tensor,
grad_idx1=None,
grad_idx2=None) -> Tuple[Tensor, Tensor]:
"""
Args:
grad_dist1 (Tensor): Gradient of chamfer distance
(xyz1 to xyz2) with shape (B, N).
grad_dist2 (Tensor): Gradient of chamfer distance
(xyz2 to xyz1) with shape (B, M).
Returns:
Tuple[Tensor, Tensor]:
- grad_xyz1 (Tensor): Gradient of the point set with shape \
(B, N, 2).
- grad_xyz2 (Tensor): Gradient of the point set with shape \
(B, M, 2).
"""
xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
device = grad_dist1.device
grad_dist1 = grad_dist1.contiguous()
grad_dist2 = grad_dist2.contiguous()
grad_xyz1 = torch.zeros(xyz1.size()).to(device)
grad_xyz2 = torch.zeros(xyz2.size()).to(device)
ext_module.chamfer_distance_backward(xyz1, xyz2, idx1, idx2,
grad_dist1, grad_dist2, grad_xyz1,
grad_xyz2)
return grad_xyz1, grad_xyz2
chamfer_distance = ChamferDistanceFunction.apply
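# A minimal usage sketch (illustrative shapes; the CUDA extension op is
# assumed to be available):
#   xyz1 = torch.rand(1, 100, 2).cuda()
#   xyz2 = torch.rand(1, 80, 2).cuda()
#   dist1, dist2, idx1, idx2 = chamfer_distance(xyz1, xyz2)
#   # dist1/idx1: (1, 100), dist2/idx2: (1, 80)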
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Union
import numpy as np
import torch
......@@ -9,22 +7,21 @@ from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['contour_expand'])
def contour_expand(kernel_mask: Union[np.array, torch.Tensor],
internal_kernel_label: Union[np.array, torch.Tensor],
min_kernel_area: int, kernel_num: int) -> list:
def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
kernel_num):
"""Expand kernel contours so that foreground pixels are assigned into
instances.
Args:
kernel_mask (np.array or torch.Tensor): The instance kernel mask with
Arguments:
kernel_mask (np.array or Tensor): The instance kernel mask with
size hxw.
internal_kernel_label (np.array or torch.Tensor): The instance internal
internal_kernel_label (np.array or Tensor): The instance internal
kernel label with size hxw.
min_kernel_area (int): The minimum kernel area.
kernel_num (int): The instance kernel number.
Returns:
list: The instance index map with size hxw.
label (list): The instance index map with size hxw.
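Example:
    >>> # Illustrative sketch with a tiny mask; the dtypes and values here
    >>> # are assumptions, real inputs are h x w instance kernel maps.
    >>> import numpy as np
    >>> kernel_mask = np.array([[0, 1], [1, 1]], dtype=np.uint8)
    >>> internal_kernel_label = np.array([[0, 1], [1, 1]], dtype=np.int32)
    >>> label = contour_expand(kernel_mask, internal_kernel_label,
    ...                        min_kernel_area=0, kernel_num=2)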
"""
assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))
assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))
......@@ -45,7 +42,7 @@ def contour_expand(kernel_mask: Union[np.array, torch.Tensor],
internal_kernel_label,
min_kernel_area=min_kernel_area,
kernel_num=kernel_num)
label = label.tolist() # type: ignore
label = label.tolist()
else:
label = ext_module.contour_expand(kernel_mask, internal_kernel_label,
min_kernel_area, kernel_num)
......
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/conv2d_gradfix.py # noqa
"""Custom replacement for `torch.nn.functional.conv2d` that supports
arbitrarily high order gradients with zero performance penalty."""
import contextlib
import warnings
from typing import Dict, Optional, Tuple, Union
import torch
from mmengine.utils import digit_version
enabled = True
weight_gradients_disabled = False
@contextlib.contextmanager
def no_weight_gradients(disable=True):
global weight_gradients_disabled
old = weight_gradients_disabled
if disable:
weight_gradients_disabled = True
yield
weight_gradients_disabled = old
def conv2d(input: torch.Tensor,
weight: torch.Tensor,
bias: Optional[torch.Tensor] = None,
stride: Union[int, Tuple[int, ...]] = 1,
padding: Union[int, Tuple[int, ...]] = 0,
dilation: Union[int, Tuple[int, ...]] = 1,
groups: int = 1):
flag = True
if digit_version(torch.__version__) >= digit_version('1.10.0'):
warnings.warn('Since '
'aten:cudnn_convolution_backward_weight is '
f'not supported in torch=={torch.__version__},'
' rolling back to `torch.nn.functional.conv2d`')
flag = False
if _should_use_custom_op(input) and flag:
return _conv2d_gradfix(
transpose=False,
weight_shape=weight.shape,
stride=stride,
padding=padding,
output_padding=0,
dilation=dilation,
groups=groups).apply(input, weight, bias)
return torch.nn.functional.conv2d(
input=input,
weight=weight,
bias=bias,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups)
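# A minimal usage sketch (illustrative; a CUDA tensor is used so the custom
# op path can be taken, and weight gradients can be skipped with the
# ``no_weight_gradients`` context manager defined above):
#   x = torch.randn(1, 3, 8, 8, device='cuda', requires_grad=True)
#   w = torch.randn(4, 3, 3, 3, device='cuda', requires_grad=True)
#   y = conv2d(x, w, padding=1)
#   with no_weight_gradients():
#       y.sum().backward()  # only x receives a gradient inside this context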
def conv_transpose2d(input: torch.Tensor,
weight: torch.Tensor,
bias: Optional[torch.Tensor] = None,
stride: Union[int, Tuple[int, ...]] = 1,
padding: Union[int, Tuple[int, ...]] = 0,
output_padding: Union[int, Tuple[int, ...]] = 0,
groups: int = 1,
dilation: Union[int, Tuple[int, ...]] = 1):
if _should_use_custom_op(input):
return _conv2d_gradfix(
transpose=True,
weight_shape=weight.shape,
stride=stride,
padding=padding,
output_padding=output_padding,
groups=groups,
dilation=dilation).apply(input, weight, bias)
return torch.nn.functional.conv_transpose2d(
input=input,
weight=weight,
bias=bias,
stride=stride,
padding=padding,
output_padding=output_padding,
groups=groups,
dilation=dilation)
def _should_use_custom_op(input):
assert isinstance(input, torch.Tensor)
if (not enabled) or (not torch.backends.cudnn.enabled):
return False
if input.device.type != 'cuda':
return False
return True
def _to_tuple(x, ndim):
xs = tuple(x) if isinstance(x, (tuple, list)) else (x, ) * ndim
assert len(xs) == ndim
assert all(isinstance(x, int) for x in xs)
return xs
_conv2d_gradfix_cache: Dict = dict()
_null_tensor = torch.empty([0])
def _conv2d_gradfix(
transpose: bool,
weight_shape: Tuple[int, ...],
stride: Union[int, Tuple[int, ...]],
padding: Union[int, Tuple[int, ...]],
output_padding: Union[int, Tuple[int, ...]],
dilation: Union[int, Tuple[int, ...]],
groups: int,
):
# Parse arguments.
ndim = 2
weight_shape = tuple(weight_shape)
stride = _to_tuple(stride, ndim)
padding = _to_tuple(padding, ndim)
output_padding = _to_tuple(output_padding, ndim)
dilation = _to_tuple(dilation, ndim)
# Lookup from cache.
key = (transpose, weight_shape, stride, padding, output_padding, dilation,
groups)
if key in _conv2d_gradfix_cache:
return _conv2d_gradfix_cache[key]
# Validate arguments.
assert groups >= 1
assert len(weight_shape) == ndim + 2
assert all(stride[i] >= 1 for i in range(ndim)) # type: ignore
assert all(padding[i] >= 0 for i in range(ndim)) # type: ignore
assert all(dilation[i] >= 0 for i in range(ndim)) # type: ignore
if not transpose:
assert all(output_padding[i] == 0 for i in range(ndim)) # type: ignore
else: # transpose
for i in range(ndim):
assert 0 <= output_padding[i] < max( # type: ignore
stride[i], # type: ignore
dilation[i]) # type: ignore
# Helpers.
common_kwargs = dict(
stride=stride, padding=padding, dilation=dilation, groups=groups)
def calc_output_padding(input_shape, output_shape):
if transpose:
return [0, 0]
return [
input_shape[i + 2] - (output_shape[i + 2] - 1) * stride[i] -
(1 - 2 * padding[i]) - dilation[i] * (weight_shape[i + 2] - 1)
for i in range(ndim)
]
# Forward & backward.
class Conv2d(torch.autograd.Function):
@staticmethod
def forward(ctx, input, weight, bias):
assert weight.shape == weight_shape
ctx.save_for_backward(
input if weight.requires_grad else _null_tensor,
weight if input.requires_grad else _null_tensor,
)
ctx.input_shape = input.shape
# Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere).
if weight_shape[2:] == stride == dilation == (
1, 1) and padding == (
0, 0) and torch.cuda.get_device_capability(
input.device) < (8, 0):
a = weight.reshape(groups, weight_shape[0] // groups,
weight_shape[1])
b = input.reshape(input.shape[0], groups,
input.shape[1] // groups, -1)
c = (a.transpose(1, 2) if transpose else a) @ b.permute(
1, 2, 0, 3).flatten(2)
c = c.reshape(-1, input.shape[0],
*input.shape[2:]).transpose(0, 1)
c = c if bias is None else c + bias.unsqueeze(0).unsqueeze(
2).unsqueeze(3)
return c.contiguous(
memory_format=(torch.channels_last if input.stride(1) ==
1 else torch.contiguous_format))
# General case => cuDNN.
if transpose:
return torch.nn.functional.conv_transpose2d(
input=input,
weight=weight,
bias=bias,
output_padding=output_padding,
**common_kwargs)
return torch.nn.functional.conv2d(
input=input, weight=weight, bias=bias, **common_kwargs)
@staticmethod
def backward(ctx, grad_output):
input, weight = ctx.saved_tensors
input_shape = ctx.input_shape
grad_input = None
grad_weight = None
grad_bias = None
if ctx.needs_input_grad[0]:
p = calc_output_padding(
input_shape=input_shape, output_shape=grad_output.shape)
op = _conv2d_gradfix(
transpose=(not transpose),
weight_shape=weight_shape,
output_padding=p,
**common_kwargs)
grad_input = op.apply(grad_output, weight, None)
assert grad_input.shape == input_shape
if ctx.needs_input_grad[1] and not weight_gradients_disabled:
grad_weight = Conv2dGradWeight.apply(grad_output, input)
assert grad_weight.shape == weight_shape
if ctx.needs_input_grad[2]:
grad_bias = grad_output.sum([0, 2, 3])
return grad_input, grad_weight, grad_bias
# Gradient with respect to the weights.
class Conv2dGradWeight(torch.autograd.Function):
@staticmethod
def forward(ctx, grad_output, input):
ctx.save_for_backward(
grad_output if input.requires_grad else _null_tensor,
input if grad_output.requires_grad else _null_tensor,
)
ctx.grad_output_shape = grad_output.shape
ctx.input_shape = input.shape
# Simple 1x1 convolution => cuBLAS (on both Volta and Ampere).
if weight_shape[2:] == stride == dilation == (
1, 1) and padding == (0, 0):
a = grad_output.reshape(grad_output.shape[0], groups,
grad_output.shape[1] // groups,
-1).permute(1, 2, 0, 3).flatten(2)
b = input.reshape(input.shape[0], groups,
input.shape[1] // groups,
-1).permute(1, 2, 0, 3).flatten(2)
c = (b @ a.transpose(1, 2) if transpose else
a @ b.transpose(1, 2)).reshape(weight_shape)
return c.contiguous(
memory_format=(torch.channels_last if input.stride(1) ==
1 else torch.contiguous_format))
# PyTorch consolidated convolution backward API in PR:
# https://github.com/pytorch/pytorch/commit/3dc3651e0ee3623f669c3a2c096408dbc476d122 # noqa: E501
# Enhance the code referring to the discussion:
# https://github.com/pytorch/pytorch/issues/74437
if digit_version(torch.__version__) >= digit_version('1.11.0'):
empty_weight = torch.tensor(
0.0, dtype=input.dtype,
device=input.device).expand(weight_shape)
output_padding = calc_output_padding(input.shape,
grad_output.shape)
return torch.ops.aten.convolution_backward(
grad_output,
input,
empty_weight,
None,
stride=stride,
dilation=dilation,
transposed=transpose,
padding=padding,
groups=groups,
output_padding=output_padding,
output_mask=[0, 1, 0])[1]
else:
is_rocm_pytorch = False
try:
from torch.utils.cpp_extension import ROCM_HOME
is_rocm_pytorch = True if ((torch.version.hip is not None) and
(ROCM_HOME is not None)) else False
except ImportError:
pass
name = ''
flags = []
if is_rocm_pytorch:
name = ('aten::miopen_convolution_transpose_backward_weight'
if transpose else
'aten::miopen_convolution_backward_weight')
flags = [
torch.backends.cudnn.benchmark,
torch.backends.cudnn.deterministic
]
else:
# General case => cuDNN.
name = ('aten::cudnn_convolution_transpose_backward_weight'
if transpose else
'aten::cudnn_convolution_backward_weight')
flags = [
torch.backends.cudnn.benchmark,
torch.backends.cudnn.deterministic,
torch.backends.cudnn.allow_tf32
]
return torch._C._jit_get_operation(name)(weight_shape,
grad_output, input,
padding, stride,
dilation, groups,
*flags)
@staticmethod
def backward(ctx, grad2_grad_weight):
grad_output, input = ctx.saved_tensors
grad_output_shape = ctx.grad_output_shape
input_shape = ctx.input_shape
grad2_grad_output = None
grad2_input = None
if ctx.needs_input_grad[0]:
grad2_grad_output = Conv2d.apply(input, grad2_grad_weight,
None)
assert grad2_grad_output.shape == grad_output_shape
if ctx.needs_input_grad[1]:
p = calc_output_padding(
input_shape=input_shape, output_shape=grad_output_shape)
op = _conv2d_gradfix(
transpose=(not transpose),
weight_shape=weight_shape,
output_padding=p,
**common_kwargs)
grad2_input = op.apply(grad_output, grad2_grad_weight, None)
assert grad2_input.shape == input_shape
return grad2_grad_output, grad2_input
_conv2d_gradfix_cache[key] = Conv2d
return Conv2d
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou'])
def convex_giou(pointsets: torch.Tensor,
polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""Return generalized intersection-over-union (Jaccard index) between point
sets and polygons.
Args:
pointsets (torch.Tensor): It has shape (N, 18),
indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
polygons (torch.Tensor): It has shape (N, 8),
indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.
Returns:
tuple[torch.Tensor, torch.Tensor]: The first element is the gious
between point sets and polygons with the shape (N,). The second
element is the gradient of point sets with the shape (N, 18).
"""
output = pointsets.new_zeros((pointsets.size(0), 19))
ext_module.convex_giou(pointsets, polygons, output)
convex_giou = output[:, -1]
points_grad = output[:, 0:-1]
return convex_giou, points_grad
def convex_iou(pointsets: torch.Tensor,
polygons: torch.Tensor) -> torch.Tensor:
"""Return intersection-over-union (Jaccard index) between point sets and
polygons.
Args:
pointsets (torch.Tensor): It has shape (N, 18),
indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
polygons (torch.Tensor): It has shape (K, 8),
indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.
Returns:
torch.Tensor: Return the ious between point sets and polygons with the
shape (N, K).
"""
N, K = pointsets.size(0), polygons.size(0)
ious = pointsets.new_zeros((N, K))
ext_module.convex_iou(pointsets, polygons, ious)
return ious
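# A minimal usage sketch (illustrative shapes; the CUDA extension ops are
# assumed to be available):
#   pointsets = torch.rand(16, 18).cuda()
#   polygons = torch.rand(8, 8).cuda()
#   ious = convex_iou(pointsets, polygons)  # (16, 8)
#   gious, grad = convex_giou(pointsets, torch.rand(16, 8).cuda())  # (16,), (16, 18)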
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import Tensor, nn
from mmengine.utils import digit_version
from torch import nn
from torch.autograd import Function
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward',
'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward',
'right_pool_forward', 'right_pool_backward'
])
_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor:
size = x.size(dim)
output = x.clone()
class TopPoolFunction(Function):
ind = 1
while ind < size:
if flip:
cur_start = 0
cur_len = size - ind
next_start = ind
next_len = size - ind
else:
cur_start = ind
cur_len = size - ind
next_start = 0
next_len = size - ind
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.top_pool_forward(input)
ctx.save_for_backward(input)
return output
# max_temp should be cloned for backward computation
max_temp = output.narrow(dim, cur_start, cur_len).clone()
cur_temp = output.narrow(dim, cur_start, cur_len)
next_temp = output.narrow(dim, next_start, next_len)
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.top_pool_backward(input, grad_output)
return output
cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp)
ind = ind << 1
class BottomPoolFunction(Function):
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.bottom_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.bottom_pool_backward(input, grad_output)
return output
class LeftPoolFunction(Function):
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.left_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.left_pool_backward(input, grad_output)
return output
class RightPoolFunction(Function):
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.right_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.right_pool_backward(input, grad_output)
return output
......@@ -40,13 +104,11 @@ class CornerPool(nn.Module):
Corner Pooling is a new type of pooling layer that helps a
convolutional network better localize corners of bounding boxes.
Please refer to `CornerNet: Detecting Objects as Paired Keypoints
<https://arxiv.org/abs/1808.01244>`_ for more details.
Please refer to https://arxiv.org/abs/1808.01244 for more details.
Code is modified from https://github.com/princeton-vl/CornerNet-Lite.
Args:
mode (str): Pooling orientation for the pooling layer
mode(str): Pooling orientation for the pooling layer
- 'bottom': Bottom Pooling
- 'left': Left Pooling
......@@ -57,6 +119,13 @@ class CornerPool(nn.Module):
Feature map after pooling.
"""
pool_functions = {
'bottom': BottomPoolFunction,
'left': LeftPoolFunction,
'right': RightPoolFunction,
'top': TopPoolFunction,
}
cummax_dim_flip = {
'bottom': (2, False),
'left': (3, True),
......@@ -64,13 +133,23 @@ class CornerPool(nn.Module):
'top': (2, True),
}
def __init__(self, mode: str):
super().__init__()
assert mode in self.cummax_dim_flip
def __init__(self, mode):
super(CornerPool, self).__init__()
assert mode in self.pool_functions
self.mode = mode
self.corner_pool = self.pool_functions[mode]
def forward(self, x):
if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
if torch.onnx.is_in_onnx_export():
assert torch.__version__ >= '1.7.0', \
'When `cummax` serves as an intermediate component whose '\
'outputs is used as inputs for another modules, it\'s '\
'expected that pytorch version must be >= 1.7.0, '\
'otherwise Error appears like: `RuntimeError: tuple '\
'appears in op that does not forward tuples, unsupported '\
'kind: prim::PythonOp`.'
def forward(self, x: Tensor) -> Tensor:
if torch.__version__ != 'parrots' and digit_version(torch.__version__) >= digit_version('1.5.0'):
dim, flip = self.cummax_dim_flip[self.mode]
if flip:
x = x.flip(dim)
......@@ -79,5 +158,4 @@ class CornerPool(nn.Module):
pool_tensor = pool_tensor.flip(dim)
return pool_tensor
else:
dim, flip = self.cummax_dim_flip[self.mode]
return _corner_pool(x, dim, flip)
return self.corner_pool.apply(x)
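# A minimal usage sketch (illustrative; on recent PyTorch versions the
# forward falls back to the pure-PyTorch cummax path above, otherwise the
# extension op is used):
#   pool = CornerPool('bottom')
#   x = torch.rand(2, 16, 32, 32)
#   y = pool(x)  # same shape as the input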
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from torch import Tensor, nn
from torch.autograd import Function
......@@ -17,14 +15,14 @@ class CorrelationFunction(Function):
@staticmethod
def forward(ctx,
input1: Tensor,
input2: Tensor,
kernel_size: int = 1,
max_displacement: int = 1,
stride: int = 1,
padding: int = 1,
dilation: int = 1,
dilation_patch: int = 1) -> Tensor:
input1,
input2,
kernel_size=1,
max_displacement=1,
stride=1,
padding=1,
dilation=1,
dilation_patch=1):
ctx.save_for_backward(input1, input2)
......@@ -62,9 +60,7 @@ class CorrelationFunction(Function):
@staticmethod
@once_differentiable
def backward(
ctx, grad_output: Tensor
) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]:
def backward(ctx, grad_output):
input1, input2 = ctx.saved_tensors
kH, kW = ctx.kernel_size
......
......@@ -13,56 +13,64 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│ ├── pytorch_cpp_helper.hpp
│ ├── pytorch_cuda_helper.hpp
│ ├── pytorch_device_registry.hpp
│   ├── cuda
│   │ ├── common_cuda_helper.hpp
│   │ ├── parrots_cudawarpfunction.cuh
│   │ ├── ...
│   │ └── ops_cuda_kernel.cuh
|   ├── mps
│   │ ├── MPSLibrary.h
│   │ ├── ...
│   │ └── MPSUtils.h
|   ├── mlu
│   │ └── ...
|   └── utils
│   │ └── ...
│   └── cuda
│   ├── common_cuda_helper.hpp
│   ├── parrots_cudawarpfunction.cuh
│   ├── ...
│   └── ops_cuda_kernel.cuh
├── onnxruntime
│   ├── onnxruntime_register.h
│   ├── onnxruntime_session_options_config_keys.h
│   ├── ort_mmcv_utils.h
│   ├── ...
│   ├── onnx_ops.h
│   └── cpu
│ ├── onnxruntime_register.cpp
│      ├── ...
│      └── onnx_ops_impl.cpp
├── parrots
│   ├── ...
│   ├── ops.cpp
│   ├── ops_parrots.cpp
│   └── ops_pytorch.h
└── pytorch
    ├── info.cpp
    ├── pybind.cpp
    ├── ...
    ├── ops.cpp
    ├── cuda
    │   ├── ...
    │   └── ops_cuda.cu
    ├── cpu
    │   ├── ...
    │   └── ops.cpp
    ├── mps
    │   ├── ...
    |   └── op_mps.mm
    └── mlu
       ├── ...
       └── op_mlu.cpp
├── pytorch
│   ├── info.cpp
│   ├── pybind.cpp
│   ├── ...
│   ├── ops.cpp
│   ├── cuda
│   │   ├── ...
│   │   └── ops_cuda.cu
│   └── cpu
│      ├── ...
│      └── ops.cpp
└── tensorrt
├── trt_cuda_helper.cuh
├── trt_plugin_helper.hpp
├── trt_plugin.hpp
├── trt_serialize.hpp
├── ...
├── trt_ops.hpp
└── plugins
   ├── trt_cuda_helper.cu
   ├── trt_plugin.cpp
   ├── ...
   ├── trt_ops.cpp
   └── trt_ops_kernel.cu
```
## Components
- `common`: This directory contains all tools and shared code.
  - `cuda`: The CUDA kernels which can be shared by all backends. **HIP** kernels are also placed here since they have similar syntax.
  - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.
  - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) devices.
  - `utils`: The kernels and utils of spconv.
- `onnxruntime`: **ONNX Runtime** support for custom ops.
  - `cpu`: CPU implementation of supported ops.
- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory (see the loading snippet below).
  - `cuda`: This directory contains CUDA kernel launchers, which feed the memory pointers of tensors to the CUDA kernels in `common/cuda`. The launchers provide a C++ interface to the CUDA implementation of the corresponding custom ops.
  - `cpu`: This directory contains CPU implementations of the corresponding custom ops.
  - `mlu`: This directory contains the launchers of the MLU kernels.
  - `mps`: MPS ops implementation and launchers.
- `tensorrt`: **TensorRT** support for custom ops.
  - `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use the shared CUDA kernels in `common/cuda`.
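
Regardless of backend, the Python side loads the compiled ops through `ext_loader`, as the ops in this commit do. A minimal sketch (the `mmcv.utils` import path is assumed here):

```python
# Minimal sketch of how a compiled custom op is exposed to Python
# (pattern taken from the ops above; the absolute import path is assumed).
from mmcv.utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])
# ext_module.box_iou_rotated(...) can now be called from Python code.
```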
## How to add new PyTorch ops?
......
......@@ -220,10 +220,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
return temp > 0;
}
});
// compute distance to origin after sort, since the points are now different.
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
#endif
// Step 4:
......@@ -270,17 +266,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
return m;
}
template <typename T>
HOST_DEVICE_INLINE T quadri_box_area(const Point<T> (&q)[4]) {
T area = 0;
#pragma unroll
for (int i = 1; i < 3; i++) {
area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
template <typename T>
HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
if (m <= 2) {
......@@ -319,25 +304,6 @@ HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox<T>& box1,
return polygon_area<T>(orderedPts, num_convex);
}
template <typename T>
HOST_DEVICE_INLINE T quadri_boxes_intersection(const Point<T> (&pts1)[4],
const Point<T> (&pts2)[4]) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
Point<T> intersectPts[24], orderedPts[24];
int num = get_intersection_points<T>(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
return polygon_area<T>(orderedPts, num_convex);
}
} // namespace
template <typename T>
......@@ -375,52 +341,3 @@ HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw,
const T iou = intersection / baseS;
return iou;
}
template <typename T>
HOST_DEVICE_INLINE T single_box_iou_quadri(T const* const pts1_raw,
T const* const pts2_raw,
const int mode_flag) {
// shift center to the middle point to achieve higher precision in result
Point<T> pts1[4], pts2[4];
auto center_shift_x =
(pts1_raw[0] + pts2_raw[0] + pts1_raw[2] + pts2_raw[2] + pts1_raw[4] +
pts2_raw[4] + pts1_raw[6] + pts2_raw[6]) /
8.0;
auto center_shift_y =
(pts1_raw[1] + pts2_raw[1] + pts1_raw[3] + pts2_raw[3] + pts1_raw[5] +
pts2_raw[5] + pts1_raw[7] + pts2_raw[7]) /
8.0;
pts1[0].x = pts1_raw[0] - center_shift_x;
pts1[0].y = pts1_raw[1] - center_shift_y;
pts1[1].x = pts1_raw[2] - center_shift_x;
pts1[1].y = pts1_raw[3] - center_shift_y;
pts1[2].x = pts1_raw[4] - center_shift_x;
pts1[2].y = pts1_raw[5] - center_shift_y;
pts1[3].x = pts1_raw[6] - center_shift_x;
pts1[3].y = pts1_raw[7] - center_shift_y;
pts2[0].x = pts2_raw[0] - center_shift_x;
pts2[0].y = pts2_raw[1] - center_shift_y;
pts2[1].x = pts2_raw[2] - center_shift_x;
pts2[1].y = pts2_raw[3] - center_shift_y;
pts2[2].x = pts2_raw[4] - center_shift_x;
pts2[2].y = pts2_raw[5] - center_shift_y;
pts2[3].x = pts2_raw[6] - center_shift_x;
pts2[3].y = pts2_raw[7] - center_shift_y;
const T area1 = quadri_box_area<T>(pts1);
const T area2 = quadri_box_area<T>(pts2);
if (area1 < 1e-14 || area2 < 1e-14) {
return 0.f;
}
const T intersection = quadri_boxes_intersection<T>(pts1, pts2);
T baseS = 1.0;
if (mode_flag == 0) {
baseS = (area1 + area2 - intersection);
} else if (mode_flag == 1) {
baseS = area1;
}
const T iou = intersection / baseS;
return iou;
}
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
template <typename scalar_t>
__global__ void active_rotated_filter_forward_cuda_kernel(
const int nthreads, const scalar_t* weight_data, const int* indices_data,
const int num_input_planes, const int num_output_planes,
const int num_orientations, const int num_rotations, const int nEntry,
scalar_t* output_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int l = index % nEntry;
int j = (index / nEntry) % num_input_planes;
int i = index / nEntry / num_input_planes;
int k;
scalar_t val = *(weight_data + index);
for (k = 0; k < num_rotations; k++) {
int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
scalar_t* target = output_data +
i * (num_rotations * num_input_planes * nEntry) +
k * (num_input_planes * nEntry) + j * (nEntry) + idx;
*target = val;
}
}
}
template <typename scalar_t>
__global__ void active_rotated_filter_backward_cuda_kernel(
const int nthreads, const scalar_t* gradWeight_data,
const int* indices_data, const int num_input_planes,
const int num_output_planes, const int num_orientations,
const int num_rotations, const int nEntry, scalar_t* weight_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int l = index % nEntry;
int j = (index / nEntry) % num_input_planes;
int i = index / nEntry / num_input_planes;
int k;
scalar_t* val = weight_data + index;
*val = 0;
scalar_t tmp = 0;
for (k = 0; k < num_rotations; k++) {
int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
scalar_t target =
*(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) +
k * (num_input_planes * nEntry) + j * (nEntry) + idx);
tmp = tmp + target;
}
*val = tmp;
}
}
#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
......@@ -22,7 +22,8 @@ __global__ void assign_score_withk_forward_cuda_kernel(
const int O, const int aggregate, const T* points, const T* centers,
const T* scores, const int64_t* knn_idx, T* output) {
// ----- parallel loop for B, N1, K and O ---------
CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) {
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B * N1 * K * O) return;
// ------- loop for M ----------
const int b = (int)(i / (O * N1 * K));
const int o = (int)(i % (O * N1 * K) / (N1 * K));
......@@ -49,7 +50,6 @@ __global__ void assign_score_withk_forward_cuda_kernel(
scores[b * N1 * K * M + n * K * M + k * M + m];
}
output[out_idx] = val;
}
}
template <typename T>
......@@ -58,7 +58,8 @@ __global__ void assign_score_withk_points_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* scores,
const int64_t* knn_idx, T* grad_points, T* grad_centers) {
// ----- parallel loop for B, M, O ---------
CUDA_1D_KERNEL_LOOP(i, B * M * O) {
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B * M * O) return;
int b = (int)(i / (M * O));
int m = (int)(i % (M * O) / O);
int o = (int)(i % O);
......@@ -68,8 +69,8 @@ __global__ void assign_score_withk_points_backward_cuda_kernel(
for (int k = 0; k < K; k++) {
int kn = knn_idx[b * N * K + n * K + k];
int cn = knn_idx[b * N * K + n * K + 0];
if (kn >= N0 || kn < 0) { // if index overflows, it is out of the
// neighborhood range
if (kn >= N0 ||
kn < 0) { // if index overflows, it is out of the neighborhood range
continue;
}
atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
......@@ -80,7 +81,6 @@ __global__ void assign_score_withk_points_backward_cuda_kernel(
grad_out[b * O * N * K + o * N * K + n * K + k]);
}
}
}
}
template <typename T>
......@@ -89,7 +89,8 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* points,
const T* centers, const int64_t* knn_idx, T* grad_scores) {
// ----- parallel loop for B, N, K, M ---------
CUDA_1D_KERNEL_LOOP(i, B * N * K * M) {
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B * N * K * M) return;
const int b = (int)(i / (N * M * K));
const int n = (int)(i % (N * M * K) / M / K);
const int k = (int)(i % (M * K) / M);
......@@ -110,7 +111,6 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel(
grad_out[b * O * N * K + o * N * K + n * K + k];
}
grad_scores[out_idx] = val;
}
}
#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
......@@ -21,8 +21,8 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
// output:
// idx: (B, M, nsample)
int bs_idx = blockIdx.y;
CUDA_1D_KERNEL_LOOP(pt_idx, m) {
if (bs_idx >= b) return;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || pt_idx >= m) return;
new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3;
......@@ -52,7 +52,6 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
if (cnt >= nsample) break;
}
}
}
}
#endif // BALL_QUERY_CUDA_KERNEL_CUH
......@@ -8,27 +8,6 @@
#include "pytorch_cuda_helper.hpp"
#endif
template <typename T>
__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,
T& y1, T& x2, T& y2) {
x1 = bbox[base];
y1 = bbox[base + 1];
x2 = bbox[base + 2];
y2 = bbox[base + 3];
}
template <>
__device__ __forceinline__ void load_bbox<float>(const float* bbox,
const int base, float& x1,
float& y1, float& x2,
float& y2) {
const float4 bbox_offset = reinterpret_cast<const float4*>(bbox + base)[0];
x1 = bbox_offset.x;
y1 = bbox_offset.y;
x2 = bbox_offset.z;
y2 = bbox_offset.w;
}
template <typename T>
__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
T* ious, const int num_bbox1,
......@@ -37,111 +16,69 @@ __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
const int offset) {
if (aligned) {
CUDA_1D_KERNEL_LOOP(index, num_bbox1) {
const int b1 = index;
const int b2 = index;
const int base1 = b1 << 2; // b1 * 4
T b1_x1, b1_y1, b1_x2, b1_y2;
load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
const int base2 = b2 << 2; // b2 * 4
T b2_x1, b2_y1, b2_x2, b2_y2;
load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
const T width = fmaxf(right - left + offset, 0.f);
const T height = fmaxf(bottom - top + offset, 0.f);
const T interS = width * height;
const T baseS =
fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
int b1 = index;
int b2 = index;
int base1 = b1 * 4;
T b1_x1 = bbox1[base1];
T b1_y1 = bbox1[base1 + 1];
T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
int base2 = b2 * 4;
T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
T b2_x2 = bbox2[base2 + 2];
T b2_y2 = bbox2[base2 + 3];
T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
T width = fmaxf(right - left + offset, 0.f);
T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
ious[index] = interS / baseS;
}
} else {
CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
const int b1 = index / num_bbox2;
const int b2 = index % num_bbox2;
const int base1 = b1 << 2; // b1 * 4
T b1_x1, b1_y1, b1_x2, b1_y2;
load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
const int base2 = b2 << 2; // b2 * 4
T b2_x1, b2_y1, b2_x2, b2_y2;
load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
const T width = fmaxf(right - left + offset, 0.f);
const T height = fmaxf(bottom - top + offset, 0.f);
const T interS = width * height;
const T baseS =
fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
ious[index] = interS / baseS;
int b1 = index / num_bbox2;
int b2 = index % num_bbox2;
int base1 = b1 * 4;
T b1_x1 = bbox1[base1];
T b1_y1 = bbox1[base1 + 1];
T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
int base2 = b2 * 4;
T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
T b2_x2 = bbox2[base2 + 2];
T b2_y2 = bbox2[base2 + 3];
T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
T width = fmaxf(right - left + offset, 0.f);
T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
ious[index] = interS / baseS;
}
}
#if __CUDA_ARCH__ >= 530
__device__ __forceinline__ __half __half_area(const __half x1, const __half y1,
const __half x2, const __half y2,
const __half offset) {
const __half half_w = __hadd(__hsub(x2, x1), offset);
const __half half_h = __hadd(__hsub(y2, y1), offset);
return __hmul(half_w, half_h);
}
__device__ __forceinline__ __half __half_max(const __half a, const __half b) {
return __hge(a, b) ? a : b;
}
__device__ __forceinline__ __half __half_min(const __half a, const __half b) {
return __hle(a, b) ? a : b;
}
// fp16 won't provide much speedup when aligned==true. It is useful when
// aligned==false, where it gives roughly a 40% speedup.
__device__ void bbox_overlaps_cuda_kernel_half(
const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1,
const int num_bbox2, const int mode, const bool aligned, const int offset) {
const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2;
const __half h_offset = __int2half_rn(offset);
CUDA_1D_KERNEL_LOOP(index, num_output) {
const int b1 = aligned ? index : index / num_bbox2;
const int b2 = aligned ? index : index % num_bbox2;
const int base1 = b1 << 2;
__half b1_x1, b1_y1, b1_x2, b1_y2;
load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset);
const int base2 = b2 << 2;
__half b2_x1, b2_y1, b2_x2, b2_y2;
load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset);
const __half left = __half_max(b1_x1, b2_x1),
right = __half_min(b1_x2, b2_x2);
const __half top = __half_max(b1_y1, b2_y1),
bottom = __half_min(b1_y2, b2_y2);
const __half width =
__half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f));
const __half height =
__half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f));
const __half interS = __hmul(width, height);
const __half baseS = __half_max(
mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area,
h_offset);
ious[index] = __hdiv(interS, baseS);
}
}
#endif // __CUDA_ARCH__ >= 530
#endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu
#ifndef BEZIER_ALIGN_CUDA_KERNEL_CUH
#define BEZIER_ALIGN_CUDA_KERNEL_CUH
#include <float.h>
#ifdef MMCV_WITH_TRT
#include "common_cuda_helper.hpp"
#else // MMCV_WITH_TRT
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif // MMCV_USE_PARROTS
#endif // MMCV_WITH_TRT
template <typename T>
__device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3,
const T u) {
return ((1. - u) * (1. - u) * (1. - u) * p0 +
3. * u * (1. - u) * (1. - u) * p1 + 3. * u * u * (1. - u) * p2 +
u * u * u * p3);
}
template <typename T>
__global__ void bezier_align_forward_cuda_kernel(
const int nthreads,
const T *bottom_data, // inputs
const T *bottom_rois, // bottom rois contains the bezier curve
T *top_data, // outputs
const int pooled_height, const int pooled_width, const T spatial_scale,
const int sampling_ratio, bool aligned, const int channels,
const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
// beziers have size Nx(1+8*2) = Nx17
const T *offset_bottom_rois = bottom_rois + n * 17;
int roi_batch_ind = offset_bottom_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
// TODO: avoid this by using parallel annotation, for good
T p0_x = offset_bottom_rois[1] * spatial_scale;
T p0_y = offset_bottom_rois[2] * spatial_scale;
T p1_x = offset_bottom_rois[3] * spatial_scale;
T p1_y = offset_bottom_rois[4] * spatial_scale;
T p2_x = offset_bottom_rois[5] * spatial_scale;
T p2_y = offset_bottom_rois[6] * spatial_scale;
T p3_x = offset_bottom_rois[7] * spatial_scale;
T p3_y = offset_bottom_rois[8] * spatial_scale;
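    // Control points of the second (bottom) curve are read in reverse order
    // (indices 15..9), presumably because the 16 coordinates trace the region
    // boundary, so the bottom curve is stored right-to-left w.r.t. the top one.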
T p4_x = offset_bottom_rois[15] * spatial_scale;
T p4_y = offset_bottom_rois[16] * spatial_scale;
T p5_x = offset_bottom_rois[13] * spatial_scale;
T p5_y = offset_bottom_rois[14] * spatial_scale;
T p6_x = offset_bottom_rois[11] * spatial_scale;
T p6_y = offset_bottom_rois[12] * spatial_scale;
T p7_x = offset_bottom_rois[9] * spatial_scale;
T p7_y = offset_bottom_rois[10] * spatial_scale;
// compute the coords
const T u = pw / static_cast<T>(pooled_width);
const T v = ph / static_cast<T>(pooled_height);
const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
const T x_center = x1 * v + x0 * (1. - v) - offset;
const T y_center = y1 * v + y0 * (1. - v) - offset;
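    // Each pooled cell (ph, pw) maps to a point obtained by evaluating both
    // Bezier curves at u and linearly interpolating between them at v.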
T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));
T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));
if (!aligned) { // for backward-compatibility only
roi_width = max(roi_width, (T)1.);
roi_height = max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T *offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
    // When the sampling grid is empty, output 0 (i.e. 0/1) instead of NaN.
const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T y = y_center - (T)0.5 * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = x_center - (T)0.5 * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val = bilinear_interpolate(offset_bottom_data, height, width, y, x,
index);
output_val += val;
}
}
output_val /= count;
top_data[index] = output_val;
}
}
template <typename T>
__global__ void bezier_align_backward_cuda_kernel(
const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int sampling_ratio, bool aligned, const int channels,
const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
// beziers have size Nx(1+8*2) = Nx17
const T *offset_bottom_rois = bottom_rois + n * 17;
int roi_batch_ind = offset_bottom_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
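    // The ROI decoding below mirrors the forward kernel, including the
    // reversed ordering of the second curve's control points.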
T p0_x = offset_bottom_rois[1] * spatial_scale;
T p0_y = offset_bottom_rois[2] * spatial_scale;
T p1_x = offset_bottom_rois[3] * spatial_scale;
T p1_y = offset_bottom_rois[4] * spatial_scale;
T p2_x = offset_bottom_rois[5] * spatial_scale;
T p2_y = offset_bottom_rois[6] * spatial_scale;
T p3_x = offset_bottom_rois[7] * spatial_scale;
T p3_y = offset_bottom_rois[8] * spatial_scale;
T p4_x = offset_bottom_rois[15] * spatial_scale;
T p4_y = offset_bottom_rois[16] * spatial_scale;
T p5_x = offset_bottom_rois[13] * spatial_scale;
T p5_y = offset_bottom_rois[14] * spatial_scale;
T p6_x = offset_bottom_rois[11] * spatial_scale;
T p6_y = offset_bottom_rois[12] * spatial_scale;
T p7_x = offset_bottom_rois[9] * spatial_scale;
T p7_y = offset_bottom_rois[10] * spatial_scale;
// compute the coords
const T u = pw / static_cast<T>(pooled_width);
const T v = ph / static_cast<T>(pooled_height);
const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
const T x_center = x1 * v + x0 * (1. - v) - offset;
const T y_center = y1 * v + y0 * (1. - v) - offset;
T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));
T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));
if (!aligned) { // for backward-compatibility only
roi_width = max(roi_width, (T)1.);
roi_height = max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T *offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T *offset_top_diff = top_diff + top_offset;
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
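    // Gradient of the output cell (n, c, ph, pw); it is distributed to the
    // four bilinear neighbours of every sampling point visited below.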
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T y = y_center - (T)0.5 * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = x_center - (T)0.5 * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high, index);
T g1 = top_diff_this_bin * w1 / count;
T g2 = top_diff_this_bin * w2 / count;
T g3 = top_diff_this_bin * w3 / count;
T g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(offset_bottom_diff + y_low * width + x_low,
static_cast<T>(g1));
atomicAdd(offset_bottom_diff + y_low * width + x_high,
static_cast<T>(g2));
atomicAdd(offset_bottom_diff + y_high * width + x_low,
static_cast<T>(g3));
atomicAdd(offset_bottom_diff + y_high * width + x_high,
static_cast<T>(g4));
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // BezierAlignBackward
#endif // BEZIER_ALIGN_CUDA_KERNEL_CUH
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#ifndef BOX_IOU_QUADRI_CUDA_CUH
#define BOX_IOU_QUADRI_CUDA_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#include "box_iou_rotated_utils.hpp"
// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;
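// Integer ceiling division, typically used to size the launch grid.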
inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }
template <typename T>
__global__ void box_iou_quadri_cuda_kernel(
const int n_boxes1, const int n_boxes2, const T* dev_boxes1,
const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {
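  // Each quadrilateral is 8 scalars (x1, y1, ..., x4, y4). Boxes are copied
  // into local arrays before calling single_box_iou_quadri.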
if (aligned) {
CUDA_1D_KERNEL_LOOP(index, n_boxes1) {
int b1 = index;
int b2 = index;
int base1 = b1 * 8;
float block_boxes1[8];
float block_boxes2[8];
block_boxes1[0] = dev_boxes1[base1 + 0];
block_boxes1[1] = dev_boxes1[base1 + 1];
block_boxes1[2] = dev_boxes1[base1 + 2];
block_boxes1[3] = dev_boxes1[base1 + 3];
block_boxes1[4] = dev_boxes1[base1 + 4];
block_boxes1[5] = dev_boxes1[base1 + 5];
block_boxes1[6] = dev_boxes1[base1 + 6];
block_boxes1[7] = dev_boxes1[base1 + 7];
int base2 = b2 * 8;
block_boxes2[0] = dev_boxes2[base2 + 0];
block_boxes2[1] = dev_boxes2[base2 + 1];
block_boxes2[2] = dev_boxes2[base2 + 2];
block_boxes2[3] = dev_boxes2[base2 + 3];
block_boxes2[4] = dev_boxes2[base2 + 4];
block_boxes2[5] = dev_boxes2[base2 + 5];
block_boxes2[6] = dev_boxes2[base2 + 6];
block_boxes2[7] = dev_boxes2[base2 + 7];
dev_ious[index] =
single_box_iou_quadri<T>(block_boxes1, block_boxes2, mode_flag);
}
} else {
CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {
int b1 = index / n_boxes2;
int b2 = index % n_boxes2;
int base1 = b1 * 8;
float block_boxes1[8];
float block_boxes2[8];
block_boxes1[0] = dev_boxes1[base1 + 0];
block_boxes1[1] = dev_boxes1[base1 + 1];
block_boxes1[2] = dev_boxes1[base1 + 2];
block_boxes1[3] = dev_boxes1[base1 + 3];
block_boxes1[4] = dev_boxes1[base1 + 4];
block_boxes1[5] = dev_boxes1[base1 + 5];
block_boxes1[6] = dev_boxes1[base1 + 6];
block_boxes1[7] = dev_boxes1[base1 + 7];
int base2 = b2 * 8;
block_boxes2[0] = dev_boxes2[base2 + 0];
block_boxes2[1] = dev_boxes2[base2 + 1];
block_boxes2[2] = dev_boxes2[base2 + 2];
block_boxes2[3] = dev_boxes2[base2 + 3];
block_boxes2[4] = dev_boxes2[base2 + 4];
block_boxes2[5] = dev_boxes2[base2 + 5];
block_boxes2[6] = dev_boxes2[base2 + 6];
block_boxes2[7] = dev_boxes2[base2 + 7];
dev_ious[index] =
single_box_iou_quadri<T>(block_boxes1, block_boxes2, mode_flag);
}
}
}
#endif
......@@ -8,7 +8,7 @@
#include "pytorch_cuda_helper.hpp"
#endif
#ifdef MMCV_WITH_HIP
#ifdef HIP_DIFF
#define WARP_SIZE 64
#else
#define WARP_SIZE 32
......@@ -29,22 +29,22 @@ __device__ inline int Loc2Index(const int n, const int c, const int h,
int index = w + (h + (c + n * channel_num) * height) * width;
return index;
}
#ifndef MMCV_WITH_HIP
#ifndef HIP_DIFF
/* TODO: move this to a common place */
template <typename scalar_t>
__device__ inline scalar_t min(scalar_t a, scalar_t b) {
__device__ inline scalar_t mmcv_min(scalar_t a, scalar_t b) {
return a < b ? a : b;
}
template <typename scalar_t>
__device__ inline scalar_t max(scalar_t a, scalar_t b) {
__device__ inline scalar_t mmcv_max(scalar_t a, scalar_t b) {
return a > b ? a : b;
}
#endif
template <typename scalar_t>
__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP
#ifdef HIP_DIFF
val += __shfl_down(val, offset);
#else
val += __shfl_down_sync(FULL_MASK, val, offset);
......@@ -55,11 +55,11 @@ __device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
template <>
__device__ __forceinline__ phalf warpReduceSum(phalf val) {
for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP
__PHALF(val) += __shfl_down(val, offset);
#ifdef HIP_DIFF
__PHALF(val) += __shfl_down(FULL_MASK, val, offset);
#else
__PHALF(val) +=
__shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset);
__shfl_down_sync(FULL_MASK, static_cast<__half>(__PHALF(val)), offset);
#endif
return val;
}
......@@ -316,7 +316,7 @@ __global__ void CARAFEBackward_Mask(const int num_kernels,
output_val += top_diff[top_id] * bottom_data[bottom_id];
}
}
#ifdef MMCV_WITH_HIP
#ifdef HIP_DIFF
__syncthreads();
#else
__syncwarp();
......