Commit 6f3c5f1c authored by limm's avatar limm
Browse files

support v1.4.0

parent 6f674c7e
...@@ -2,8 +2,6 @@ ...@@ -2,8 +2,6 @@
# modified from # modified from
# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py # https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py
from typing import Tuple
import torch import torch
import torch.nn as nn import torch.nn as nn
from torch.autograd import Function from torch.autograd import Function
...@@ -23,8 +21,7 @@ class BorderAlignFunction(Function): ...@@ -23,8 +21,7 @@ class BorderAlignFunction(Function):
'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size) 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)
@staticmethod @staticmethod
def forward(ctx, input: torch.Tensor, boxes: torch.Tensor, def forward(ctx, input, boxes, pool_size):
pool_size: int) -> torch.Tensor:
ctx.pool_size = pool_size ctx.pool_size = pool_size
ctx.input_shape = input.size() ctx.input_shape = input.size()
...@@ -48,8 +45,7 @@ class BorderAlignFunction(Function): ...@@ -48,8 +45,7 @@ class BorderAlignFunction(Function):
@staticmethod @staticmethod
@once_differentiable @once_differentiable
def backward(ctx, def backward(ctx, grad_output):
grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
boxes, argmax_idx = ctx.saved_tensors boxes, argmax_idx = ctx.saved_tensors
grad_input = grad_output.new_zeros(ctx.input_shape) grad_input = grad_output.new_zeros(ctx.input_shape)
# complex head architecture may cause grad_output uncontiguous # complex head architecture may cause grad_output uncontiguous
...@@ -76,25 +72,24 @@ class BorderAlign(nn.Module): ...@@ -76,25 +72,24 @@ class BorderAlign(nn.Module):
For each border line (e.g. top, left, bottom or right) of each box, For each border line (e.g. top, left, bottom or right) of each box,
border_align does the following: border_align does the following:
1. uniformly samples `pool_size`+1 positions on this line, involving \
1. uniformly samples ``pool_size`` +1 positions on this line, involving the start and end points.
the start and end points. 2. the corresponding features on these points are computed by \
2. the corresponding features on these points are computed by bilinear bilinear interpolation.
interpolation. 3. max pooling over all the `pool_size`+1 positions are used for \
3. max pooling over all the ``pool_size`` +1 positions are used for computing pooled feature.
computing pooled feature.
Args: Args:
pool_size (int): number of positions sampled over the boxes' borders pool_size (int): number of positions sampled over the boxes' borders
(e.g. top, bottom, left, right). (e.g. top, bottom, left, right).
""" """
def __init__(self, pool_size: int): def __init__(self, pool_size):
super().__init__() super(BorderAlign, self).__init__()
self.pool_size = pool_size self.pool_size = pool_size
def forward(self, input: torch.Tensor, def forward(self, input, boxes):
boxes: torch.Tensor) -> torch.Tensor:
""" """
Args: Args:
input: Features with shape [N,4C,H,W]. Channels ranged in [0,C), input: Features with shape [N,4C,H,W]. Channels ranged in [0,C),
...@@ -103,8 +98,8 @@ class BorderAlign(nn.Module): ...@@ -103,8 +98,8 @@ class BorderAlign(nn.Module):
boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2). boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).
Returns: Returns:
torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is Tensor: Pooled features with shape [N,C,H*W,4]. The order is
(top,left,bottom,right) for the last dimension. (top,left,bottom,right) for the last dimension.
""" """
return border_align(input, boxes, self.pool_size) return border_align(input, boxes, self.pool_size)
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['box_iou_quadri'])
def box_iou_quadri(bboxes1: torch.Tensor,
                   bboxes2: torch.Tensor,
                   mode: str = 'iou',
                   aligned: bool = False) -> torch.Tensor:
    """Return intersection-over-union (Jaccard index) of boxes.

    Both sets of boxes are expected to be in
    (x1, y1, ..., x4, y4) format.

    If ``aligned`` is ``False``, then calculate the ious between each bbox
    of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
    bboxes1 and bboxes2.

    Args:
        bboxes1 (torch.Tensor): quadrilateral bboxes 1. It has shape (N, 8),
            indicating (x1, y1, ..., x4, y4) for each row.
        bboxes2 (torch.Tensor): quadrilateral bboxes 2. It has shape (M, 8),
            indicating (x1, y1, ..., x4, y4) for each row.
        mode (str): "iou" (intersection over union) or "iof" (intersection
            over foreground).
        aligned (bool): If ``True``, compute the iou of each aligned pair of
            bboxes1 and bboxes2 (requires N == M); otherwise compute the full
            N x M iou matrix. Defaults to ``False``.

    Returns:
        torch.Tensor: Return the ious between boxes. If ``aligned`` is
        ``False``, the shape of ious is (N, M) else (N,).
    """
    assert mode in ['iou', 'iof']
    # Map the mode string to the integer flag expected by the C++/CUDA op.
    mode_dict = {'iou': 0, 'iof': 1}
    mode_flag = mode_dict[mode]
    rows = bboxes1.size(0)
    cols = bboxes2.size(0)
    # Pre-allocate the flat output buffer that the extension fills in place.
    if aligned:
        ious = bboxes1.new_zeros(rows)
    else:
        ious = bboxes1.new_zeros(rows * cols)
    # The extension op requires contiguous memory.
    bboxes1 = bboxes1.contiguous()
    bboxes2 = bboxes2.contiguous()
    ext_module.box_iou_quadri(
        bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned)
    if not aligned:
        ious = ious.view(rows, cols)
    return ious
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import torch
from ..utils import ext_loader from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated']) ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])
def box_iou_rotated(bboxes1: torch.Tensor, def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
bboxes2: torch.Tensor,
mode: str = 'iou',
aligned: bool = False,
clockwise: bool = True) -> torch.Tensor:
"""Return intersection-over-union (Jaccard index) of boxes. """Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in Both sets of boxes are expected to be in
...@@ -20,110 +14,18 @@ def box_iou_rotated(bboxes1: torch.Tensor, ...@@ -20,110 +14,18 @@ def box_iou_rotated(bboxes1: torch.Tensor,
of bboxes1 and bboxes2, otherwise the ious between each aligned pair of of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
bboxes1 and bboxes2. bboxes1 and bboxes2.
.. note:: Arguments:
The operator assumes: boxes1 (Tensor): rotated bboxes 1. \
It has shape (N, 5), indicating (x, y, w, h, theta) for each row.
1) The positive direction along x axis is left -> right. Note that theta is in radian.
boxes2 (Tensor): rotated bboxes 2. \
2) The positive direction along y axis is top -> down. It has shape (M, 5), indicating (x, y, w, h, theta) for each row.
Note that theta is in radian.
3) The w border is in parallel with x axis when angle = 0.
However, there are 2 opposite definitions of the positive angular
direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports
both definitions and uses CW by default.
Please set ``clockwise=False`` if you are using the CCW definition.
The coordinate system when ``clockwise`` is ``True`` (default)
.. code-block:: none
0-------------------> x (0 rad)
| A-------------B
| | |
| | box h
| | angle=0 |
| D------w------C
v
y (pi/2 rad)
In such coordination system the rotation matrix is
.. math::
\\begin{pmatrix}
\\cos\\alpha & -\\sin\\alpha \\\\
\\sin\\alpha & \\cos\\alpha
\\end{pmatrix}
The coordinates of the corner point A can be calculated as:
.. math::
P_A=
\\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
=
\\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
\\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\
\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
\\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
=
\\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha
\\\\
y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
The coordinate system when ``clockwise`` is ``False``
.. code-block:: none
0-------------------> x (0 rad)
| A-------------B
| | |
| | box h
| | angle=0 |
| D------w------C
v
y (-pi/2 rad)
In such coordination system the rotation matrix is
.. math::
\\begin{pmatrix}
\\cos\\alpha & \\sin\\alpha \\\\
-\\sin\\alpha & \\cos\\alpha
\\end{pmatrix}
The coordinates of the corner point A can be calculated as:
.. math::
P_A=
\\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
=
\\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
\\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\
-\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
\\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
=
\\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha
\\\\
y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
Args:
boxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5),
indicating (x, y, w, h, theta) for each row. Note that theta is in
radian.
boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5),
indicating (x, y, w, h, theta) for each row. Note that theta is in
radian.
mode (str): "iou" (intersection over union) or iof (intersection over mode (str): "iou" (intersection over union) or iof (intersection over
foreground). foreground).
clockwise (bool): flag indicating whether the positive angular
orientation is clockwise. default True.
`New in version 1.4.3.`
Returns: Returns:
torch.Tensor: Return the ious betweens boxes. If ``aligned`` is ious(Tensor): shape (N, M) if aligned == False else shape (N,)
``False``, the shape of ious is (N, M) else (N,).
""" """
assert mode in ['iou', 'iof'] assert mode in ['iou', 'iof']
mode_dict = {'iou': 0, 'iof': 1} mode_dict = {'iou': 0, 'iof': 1}
...@@ -133,12 +35,7 @@ def box_iou_rotated(bboxes1: torch.Tensor, ...@@ -133,12 +35,7 @@ def box_iou_rotated(bboxes1: torch.Tensor,
if aligned: if aligned:
ious = bboxes1.new_zeros(rows) ious = bboxes1.new_zeros(rows)
else: else:
ious = bboxes1.new_zeros(rows * cols) ious = bboxes1.new_zeros((rows * cols))
if not clockwise:
flip_mat = bboxes1.new_ones(bboxes1.shape[-1])
flip_mat[-1] = -1
bboxes1 = bboxes1 * flip_mat
bboxes2 = bboxes2 * flip_mat
bboxes1 = bboxes1.contiguous() bboxes1 = bboxes1.contiguous()
bboxes2 = bboxes2.contiguous() bboxes2 = bboxes2.contiguous()
ext_module.box_iou_rotated( ext_module.box_iou_rotated(
......
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from mmengine.model import normal_init, xavier_init
from mmengine.registry import MODELS
from torch import Tensor
from torch.autograd import Function from torch.autograd import Function
from torch.nn.modules.module import Module from torch.nn.modules.module import Module
from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init
from ..utils import ext_loader from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [ ext_module = ext_loader.load_ext('_ext', [
...@@ -21,8 +17,7 @@ ext_module = ext_loader.load_ext('_ext', [ ...@@ -21,8 +17,7 @@ ext_module = ext_loader.load_ext('_ext', [
class CARAFENaiveFunction(Function): class CARAFENaiveFunction(Function):
@staticmethod @staticmethod
def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
group_size: int, scale_factor: int) -> Tensor:
return g.op( return g.op(
'mmcv::MMCVCARAFENaive', 'mmcv::MMCVCARAFENaive',
features, features,
...@@ -32,8 +27,7 @@ class CARAFENaiveFunction(Function): ...@@ -32,8 +27,7 @@ class CARAFENaiveFunction(Function):
scale_factor_f=scale_factor) scale_factor_f=scale_factor)
@staticmethod @staticmethod
def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
group_size: int, scale_factor: int) -> Tensor:
assert scale_factor >= 1 assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor assert masks.size(-1) == features.size(-1) * scale_factor
...@@ -56,15 +50,12 @@ class CARAFENaiveFunction(Function): ...@@ -56,15 +50,12 @@ class CARAFENaiveFunction(Function):
group_size=group_size, group_size=group_size,
scale_factor=scale_factor) scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad or \ if features.requires_grad or masks.requires_grad:
torch.__version__ == 'parrots':
ctx.save_for_backward(features, masks) ctx.save_for_backward(features, masks)
return output return output
@staticmethod @staticmethod
def backward( def backward(ctx, grad_output):
ctx,
grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
assert grad_output.is_cuda assert grad_output.is_cuda
features, masks = ctx.saved_tensors features, masks = ctx.saved_tensors
...@@ -92,8 +83,8 @@ carafe_naive = CARAFENaiveFunction.apply ...@@ -92,8 +83,8 @@ carafe_naive = CARAFENaiveFunction.apply
class CARAFENaive(Module): class CARAFENaive(Module):
def __init__(self, kernel_size: int, group_size: int, scale_factor: int): def __init__(self, kernel_size, group_size, scale_factor):
super().__init__() super(CARAFENaive, self).__init__()
assert isinstance(kernel_size, int) and isinstance( assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int) group_size, int) and isinstance(scale_factor, int)
...@@ -101,7 +92,7 @@ class CARAFENaive(Module): ...@@ -101,7 +92,7 @@ class CARAFENaive(Module):
self.group_size = group_size self.group_size = group_size
self.scale_factor = scale_factor self.scale_factor = scale_factor
def forward(self, features: Tensor, masks: Tensor) -> Tensor: def forward(self, features, masks):
return carafe_naive(features, masks, self.kernel_size, self.group_size, return carafe_naive(features, masks, self.kernel_size, self.group_size,
self.scale_factor) self.scale_factor)
...@@ -109,8 +100,7 @@ class CARAFENaive(Module): ...@@ -109,8 +100,7 @@ class CARAFENaive(Module):
class CARAFEFunction(Function): class CARAFEFunction(Function):
@staticmethod @staticmethod
def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
group_size: int, scale_factor: int) -> Tensor:
return g.op( return g.op(
'mmcv::MMCVCARAFE', 'mmcv::MMCVCARAFE',
features, features,
...@@ -120,8 +110,7 @@ class CARAFEFunction(Function): ...@@ -120,8 +110,7 @@ class CARAFEFunction(Function):
scale_factor_f=scale_factor) scale_factor_f=scale_factor)
@staticmethod @staticmethod
def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
group_size: int, scale_factor: int) -> Tensor:
assert scale_factor >= 1 assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor assert masks.size(-1) == features.size(-1) * scale_factor
...@@ -150,15 +139,14 @@ class CARAFEFunction(Function): ...@@ -150,15 +139,14 @@ class CARAFEFunction(Function):
group_size=group_size, group_size=group_size,
scale_factor=scale_factor) scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad or \ if features.requires_grad or masks.requires_grad:
torch.__version__ == 'parrots':
ctx.save_for_backward(features, masks, rfeatures) ctx.save_for_backward(features, masks, rfeatures)
return output return output
@staticmethod @staticmethod
def backward( def backward(ctx, grad_output):
ctx, assert grad_output.is_cuda
grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
features, masks, rfeatures = ctx.saved_tensors features, masks, rfeatures = ctx.saved_tensors
kernel_size = ctx.kernel_size kernel_size = ctx.kernel_size
group_size = ctx.group_size group_size = ctx.group_size
...@@ -192,8 +180,7 @@ carafe = CARAFEFunction.apply ...@@ -192,8 +180,7 @@ carafe = CARAFEFunction.apply
class CARAFE(Module): class CARAFE(Module):
""" CARAFE: Content-Aware ReAssembly of FEatures """ CARAFE: Content-Aware ReAssembly of FEatures
Please refer to `CARAFE: Content-Aware ReAssembly of FEatures Please refer to https://arxiv.org/abs/1905.02188 for more details.
<https://arxiv.org/abs/1905.02188>`_ for more details.
Args: Args:
kernel_size (int): reassemble kernel size kernel_size (int): reassemble kernel size
...@@ -204,8 +191,8 @@ class CARAFE(Module): ...@@ -204,8 +191,8 @@ class CARAFE(Module):
upsampled feature map upsampled feature map
""" """
def __init__(self, kernel_size: int, group_size: int, scale_factor: int): def __init__(self, kernel_size, group_size, scale_factor):
super().__init__() super(CARAFE, self).__init__()
assert isinstance(kernel_size, int) and isinstance( assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int) group_size, int) and isinstance(scale_factor, int)
...@@ -213,19 +200,19 @@ class CARAFE(Module): ...@@ -213,19 +200,19 @@ class CARAFE(Module):
self.group_size = group_size self.group_size = group_size
self.scale_factor = scale_factor self.scale_factor = scale_factor
def forward(self, features: Tensor, masks: Tensor) -> Tensor: def forward(self, features, masks):
return carafe(features, masks, self.kernel_size, self.group_size, return carafe(features, masks, self.kernel_size, self.group_size,
self.scale_factor) self.scale_factor)
@MODELS.register_module(name='carafe') @UPSAMPLE_LAYERS.register_module(name='carafe')
class CARAFEPack(nn.Module): class CARAFEPack(nn.Module):
"""A unified package of CARAFE upsampler that contains: 1) channel """A unified package of CARAFE upsampler that contains: 1) channel
compressor 2) content encoder 3) CARAFE op. compressor 2) content encoder 3) CARAFE op.
Official implementation of ICCV 2019 paper Official implementation of ICCV 2019 paper
`CARAFE: Content-Aware ReAssembly of FEatures CARAFE: Content-Aware ReAssembly of FEatures
<https://arxiv.org/abs/1905.02188>`_. Please refer to https://arxiv.org/abs/1905.02188 for more details.
Args: Args:
channels (int): input feature channels channels (int): input feature channels
...@@ -241,14 +228,14 @@ class CARAFEPack(nn.Module): ...@@ -241,14 +228,14 @@ class CARAFEPack(nn.Module):
""" """
def __init__(self, def __init__(self,
channels: int, channels,
scale_factor: int, scale_factor,
up_kernel: int = 5, up_kernel=5,
up_group: int = 1, up_group=1,
encoder_kernel: int = 3, encoder_kernel=3,
encoder_dilation: int = 1, encoder_dilation=1,
compressed_channels: int = 64): compressed_channels=64):
super().__init__() super(CARAFEPack, self).__init__()
self.channels = channels self.channels = channels
self.scale_factor = scale_factor self.scale_factor = scale_factor
self.up_kernel = up_kernel self.up_kernel = up_kernel
...@@ -274,7 +261,7 @@ class CARAFEPack(nn.Module): ...@@ -274,7 +261,7 @@ class CARAFEPack(nn.Module):
xavier_init(m, distribution='uniform') xavier_init(m, distribution='uniform')
normal_init(self.content_encoder, std=0.001) normal_init(self.content_encoder, std=0.001)
def kernel_normalizer(self, mask: Tensor) -> Tensor: def kernel_normalizer(self, mask):
mask = F.pixel_shuffle(mask, self.scale_factor) mask = F.pixel_shuffle(mask, self.scale_factor)
n, mask_c, h, w = mask.size() n, mask_c, h, w = mask.size()
# use float division explicitly, # use float division explicitly,
...@@ -287,11 +274,11 @@ class CARAFEPack(nn.Module): ...@@ -287,11 +274,11 @@ class CARAFEPack(nn.Module):
return mask return mask
def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor: def feature_reassemble(self, x, mask):
x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
return x return x
def forward(self, x: Tensor) -> Tensor: def forward(self, x):
compressed_x = self.channel_compressor(x) compressed_x = self.channel_compressor(x)
mask = self.content_encoder(compressed_x) mask = self.content_encoder(compressed_x)
mask = self.kernel_normalizer(mask) mask = self.kernel_normalizer(mask)
......
...@@ -2,12 +2,11 @@ ...@@ -2,12 +2,11 @@
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from mmengine.registry import MODELS
from mmcv.cnn import Scale from mmcv.cnn import PLUGIN_LAYERS, Scale
def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor: def NEG_INF_DIAG(n, device):
"""Returns a diagonal matrix of size [n, n]. """Returns a diagonal matrix of size [n, n].
The diagonal are all "-inf". This is for avoiding calculating the The diagonal are all "-inf". This is for avoiding calculating the
...@@ -16,7 +15,7 @@ def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor: ...@@ -16,7 +15,7 @@ def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor:
return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0) return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0)
@MODELS.register_module() @PLUGIN_LAYERS.register_module()
class CrissCrossAttention(nn.Module): class CrissCrossAttention(nn.Module):
"""Criss-Cross Attention Module. """Criss-Cross Attention Module.
...@@ -42,7 +41,7 @@ class CrissCrossAttention(nn.Module): ...@@ -42,7 +41,7 @@ class CrissCrossAttention(nn.Module):
in_channels (int): Channels of the input feature map. in_channels (int): Channels of the input feature map.
""" """
def __init__(self, in_channels: int) -> None: def __init__(self, in_channels):
super().__init__() super().__init__()
self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1) self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1) self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
...@@ -50,15 +49,14 @@ class CrissCrossAttention(nn.Module): ...@@ -50,15 +49,14 @@ class CrissCrossAttention(nn.Module):
self.gamma = Scale(0.) self.gamma = Scale(0.)
self.in_channels = in_channels self.in_channels = in_channels
def forward(self, x: torch.Tensor) -> torch.Tensor: def forward(self, x):
"""forward function of Criss-Cross Attention. """forward function of Criss-Cross Attention.
Args: Args:
x (torch.Tensor): Input feature with the shape of x (Tensor): Input feature. \
(batch_size, in_channels, height, width). shape (batch_size, in_channels, height, width)
Returns: Returns:
torch.Tensor: Output of the layer, with the shape of Tensor: Output of the layer, with shape of \
(batch_size, in_channels, height, width) (batch_size, in_channels, height, width)
""" """
B, C, H, W = x.size() B, C, H, W = x.size()
...@@ -79,7 +77,7 @@ class CrissCrossAttention(nn.Module): ...@@ -79,7 +77,7 @@ class CrissCrossAttention(nn.Module):
return out return out
def __repr__(self) -> str: def __repr__(self):
s = self.__class__.__name__ s = self.__class__.__name__
s += f'(in_channels={self.in_channels})' s += f'(in_channels={self.in_channels})'
return s return s
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Sequence, Tuple
import torch
from torch import Tensor
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['chamfer_distance_forward', 'chamfer_distance_backward'])
class ChamferDistanceFunction(Function):
    """This is an implementation of the 2D Chamfer Distance.

    It has been used in the paper `Oriented RepPoints for Aerial Object
    Detection (CVPR 2022) <https://arxiv.org/abs/2105.11111>`_.
    """

    @staticmethod
    def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]:
        """
        Args:
            xyz1 (Tensor): Point set with shape (B, N, 2).
            xyz2 (Tensor): Point set with shape (B, M, 2).

        Returns:
            Sequence[Tensor]:

                - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with
                    shape (B, N).
                - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with
                    shape (B, M).
                - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2)
                    with shape (B, N), which is used to compute the gradient.
                - idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1)
                    with shape (B, M), which is used to compute the gradient.
        """
        batch_size, n, _ = xyz1.size()
        _, m, _ = xyz2.size()
        device = xyz1.device
        # The extension op requires contiguous memory.
        xyz1 = xyz1.contiguous()
        xyz2 = xyz2.contiguous()

        # Output buffers filled in place by the extension op.
        dist1 = torch.zeros(batch_size, n).to(device)
        dist2 = torch.zeros(batch_size, m).to(device)
        idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device)
        idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device)

        ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1,
                                            idx2)
        # Indices of nearest neighbours are needed again in backward().
        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
        return dist1, dist2, idx1, idx2

    @staticmethod
    @once_differentiable
    def backward(ctx,
                 grad_dist1: Tensor,
                 grad_dist2: Tensor,
                 grad_idx1=None,
                 grad_idx2=None) -> Tuple[Tensor, Tensor]:
        """
        Args:
            grad_dist1 (Tensor): Gradient of chamfer distance
                (xyz1 to xyz2) with shape (B, N).
            grad_dist2 (Tensor): Gradient of chamfer distance
                (xyz2 to xyz1) with shape (B, M).

        Returns:
            Tuple[Tensor, Tensor]:

                - grad_xyz1 (Tensor): Gradient of the point set with shape \
                    (B, N, 2).
                - grad_xyz2 (Tensor): Gradient of the point set with shape \
                    (B, M, 2).
        """
        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
        device = grad_dist1.device
        # The extension op requires contiguous memory.
        grad_dist1 = grad_dist1.contiguous()
        grad_dist2 = grad_dist2.contiguous()
        # Gradients accumulated in place by the extension op.
        grad_xyz1 = torch.zeros(xyz1.size()).to(device)
        grad_xyz2 = torch.zeros(xyz2.size()).to(device)

        ext_module.chamfer_distance_backward(xyz1, xyz2, idx1, idx2,
                                             grad_dist1, grad_dist2, grad_xyz1,
                                             grad_xyz2)
        return grad_xyz1, grad_xyz2
chamfer_distance = ChamferDistanceFunction.apply
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from typing import Union
import numpy as np import numpy as np
import torch import torch
...@@ -9,22 +7,21 @@ from ..utils import ext_loader ...@@ -9,22 +7,21 @@ from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['contour_expand']) ext_module = ext_loader.load_ext('_ext', ['contour_expand'])
def contour_expand(kernel_mask: Union[np.array, torch.Tensor], def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
internal_kernel_label: Union[np.array, torch.Tensor], kernel_num):
min_kernel_area: int, kernel_num: int) -> list:
"""Expand kernel contours so that foreground pixels are assigned into """Expand kernel contours so that foreground pixels are assigned into
instances. instances.
Args: Arguments:
kernel_mask (np.array or torch.Tensor): The instance kernel mask with kernel_mask (np.array or Tensor): The instance kernel mask with
size hxw. size hxw.
internal_kernel_label (np.array or torch.Tensor): The instance internal internal_kernel_label (np.array or Tensor): The instance internal
kernel label with size hxw. kernel label with size hxw.
min_kernel_area (int): The minimum kernel area. min_kernel_area (int): The minimum kernel area.
kernel_num (int): The instance kernel number. kernel_num (int): The instance kernel number.
Returns: Returns:
list: The instance index map with size hxw. label (list): The instance index map with size hxw.
""" """
assert isinstance(kernel_mask, (torch.Tensor, np.ndarray)) assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))
assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray)) assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))
...@@ -45,7 +42,7 @@ def contour_expand(kernel_mask: Union[np.array, torch.Tensor], ...@@ -45,7 +42,7 @@ def contour_expand(kernel_mask: Union[np.array, torch.Tensor],
internal_kernel_label, internal_kernel_label,
min_kernel_area=min_kernel_area, min_kernel_area=min_kernel_area,
kernel_num=kernel_num) kernel_num=kernel_num)
label = label.tolist() # type: ignore label = label.tolist()
else: else:
label = ext_module.contour_expand(kernel_mask, internal_kernel_label, label = ext_module.contour_expand(kernel_mask, internal_kernel_label,
min_kernel_area, kernel_num) min_kernel_area, kernel_num)
......
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/conv2d_gradfix.py # noqa
"""Custom replacement for `torch.nn.functional.conv2d` that supports
arbitrarily high order gradients with zero performance penalty."""
import contextlib
import warnings
from typing import Dict, Optional, Tuple, Union
import torch
from mmengine.utils import digit_version
# Module-level switches for the custom gradfix op.
# `enabled`: master on/off switch checked by _should_use_custom_op().
# `weight_gradients_disabled`: toggled by the no_weight_gradients() context
# manager to skip weight-gradient computation in the backward pass.
enabled = True
weight_gradients_disabled = False
@contextlib.contextmanager
def no_weight_gradients(disable=True):
    """Context manager that temporarily disables weight gradients.

    While active (and ``disable`` is True), the module-level flag
    ``weight_gradients_disabled`` is set so that the custom conv2d backward
    skips computing weight gradients.

    Args:
        disable (bool): Whether to actually disable weight gradients inside
            the ``with`` block. If False, this is a no-op. Defaults to True.
    """
    global weight_gradients_disabled
    old = weight_gradients_disabled
    if disable:
        weight_gradients_disabled = True
    # Restore in a `finally` so an exception raised inside the `with` body
    # cannot leak the disabled state (the original code skipped the restore
    # on exception).
    try:
        yield
    finally:
        weight_gradients_disabled = old
def conv2d(input: torch.Tensor,
           weight: torch.Tensor,
           bias: Optional[torch.Tensor] = None,
           stride: Union[int, Tuple[int, ...]] = 1,
           padding: Union[int, Tuple[int, ...]] = 0,
           dilation: Union[int, Tuple[int, ...]] = 1,
           groups: int = 1):
    """Replacement for ``torch.nn.functional.conv2d``.

    Dispatches to the custom gradfix autograd op when usable; otherwise
    falls back to the stock ``torch.nn.functional.conv2d``. On torch>=1.10
    the custom op is unconditionally disabled (with a warning), since the
    backend function it relies on is not supported there.
    """
    use_gradfix = True
    if digit_version(torch.__version__) >= digit_version('1.10.0'):
        warnings.warn('Since '
                      'aten:cudnn_convolution_backward_weight is '
                      f'not supported in torch=={torch.__version__},'
                      ' rolling back to `torch.nn.functional.conv2d`')
        use_gradfix = False
    if _should_use_custom_op(input) and use_gradfix:
        op = _conv2d_gradfix(
            transpose=False,
            weight_shape=weight.shape,
            stride=stride,
            padding=padding,
            output_padding=0,
            dilation=dilation,
            groups=groups)
        return op.apply(input, weight, bias)
    return torch.nn.functional.conv2d(
        input=input,
        weight=weight,
        bias=bias,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups)
def conv_transpose2d(input: torch.Tensor,
                     weight: torch.Tensor,
                     bias: Optional[torch.Tensor] = None,
                     stride: Union[int, Tuple[int, ...]] = 1,
                     padding: Union[int, Tuple[int, ...]] = 0,
                     output_padding: Union[int, Tuple[int, ...]] = 0,
                     groups: int = 1,
                     dilation: Union[int, Tuple[int, ...]] = 1):
    """Replacement for ``torch.nn.functional.conv_transpose2d``.

    Uses the custom gradfix autograd op when applicable, falling back to
    the stock ``torch.nn.functional.conv_transpose2d`` otherwise.
    """
    # Guard clause: plain fallback when the custom op cannot be used.
    if not _should_use_custom_op(input):
        return torch.nn.functional.conv_transpose2d(
            input=input,
            weight=weight,
            bias=bias,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            dilation=dilation)
    op = _conv2d_gradfix(
        transpose=True,
        weight_shape=weight.shape,
        stride=stride,
        padding=padding,
        output_padding=output_padding,
        groups=groups,
        dilation=dilation)
    return op.apply(input, weight, bias)
def _should_use_custom_op(input):
    """Return True if the custom gradfix op should handle ``input``.

    Requires the module-level ``enabled`` flag, an enabled cuDNN backend,
    and a CUDA tensor.
    """
    assert isinstance(input, torch.Tensor)
    if not enabled or not torch.backends.cudnn.enabled:
        return False
    return input.device.type == 'cuda'
def _to_tuple(x, ndim):
xs = tuple(x) if isinstance(x, (tuple, list)) else (x, ) * ndim
assert len(xs) == ndim
assert all(isinstance(x, int) for x in xs)
return xs
# Cache of generated autograd Function classes, keyed by the full conv
# configuration (transpose, weight shape, stride, padding, ...), so each
# configuration is compiled into a class only once.
_conv2d_gradfix_cache: Dict = dict()
# Sentinel saved via save_for_backward in place of a tensor whose gradient
# is not required.
_null_tensor = torch.empty([0])
def _conv2d_gradfix(
    transpose: bool,
    weight_shape: Tuple[int, ...],
    stride: Union[int, Tuple[int, ...]],
    padding: Union[int, Tuple[int, ...]],
    output_padding: Union[int, Tuple[int, ...]],
    dilation: Union[int, Tuple[int, ...]],
    groups: int,
):
    """Build (and cache) a custom autograd ``Function`` implementing 2D
    convolution / transposed convolution for one fixed configuration.

    The returned class behaves like ``torch.nn.functional.conv2d`` (or
    ``conv_transpose2d`` when ``transpose`` is True) in the forward pass, but
    routes the weight gradient through ``Conv2dGradWeight`` so it can be
    suppressed via the module-level ``weight_gradients_disabled`` flag and so
    second-order gradients stay well defined.

    Args:
        transpose (bool): Whether to implement ``conv_transpose2d`` instead
            of ``conv2d``.
        weight_shape (tuple): Full 4-D shape of the convolution weight.
        stride, padding, output_padding, dilation: Scalar or 2-tuple
            convolution parameters; scalars are broadcast via ``_to_tuple``.
        groups (int): Number of convolution groups (>= 1).

    Returns:
        type: A ``torch.autograd.Function`` subclass; call its
        ``apply(input, weight, bias)``.
    """
    # Parse arguments.
    ndim = 2
    weight_shape = tuple(weight_shape)
    stride = _to_tuple(stride, ndim)
    padding = _to_tuple(padding, ndim)
    output_padding = _to_tuple(output_padding, ndim)
    dilation = _to_tuple(dilation, ndim)
    # Lookup from cache.
    key = (transpose, weight_shape, stride, padding, output_padding, dilation,
           groups)
    if key in _conv2d_gradfix_cache:
        return _conv2d_gradfix_cache[key]
    # Validate arguments.
    assert groups >= 1
    assert len(weight_shape) == ndim + 2
    assert all(stride[i] >= 1 for i in range(ndim))  # type: ignore
    assert all(padding[i] >= 0 for i in range(ndim))  # type: ignore
    assert all(dilation[i] >= 0 for i in range(ndim))  # type: ignore
    if not transpose:
        assert all(output_padding[i] == 0 for i in range(ndim))  # type: ignore
    else:  # transpose
        for i in range(ndim):
            assert 0 <= output_padding[i] < max(  # type: ignore
                stride[i],  # type: ignore
                dilation[i])  # type: ignore
    # Helpers.
    common_kwargs = dict(
        stride=stride, padding=padding, dilation=dilation, groups=groups)

    # Output padding needed so that the gradient convolution (run with the
    # opposite ``transpose`` flag) reproduces exactly ``input_shape``.
    def calc_output_padding(input_shape, output_shape):
        if transpose:
            return [0, 0]
        return [
            input_shape[i + 2] - (output_shape[i + 2] - 1) * stride[i] -
            (1 - 2 * padding[i]) - dilation[i] * (weight_shape[i + 2] - 1)
            for i in range(ndim)
        ]

    # Forward & backward.
    class Conv2d(torch.autograd.Function):

        @staticmethod
        def forward(ctx, input, weight, bias):
            assert weight.shape == weight_shape
            # Only save tensors whose counterpart actually needs a gradient;
            # otherwise save the cheap ``_null_tensor`` sentinel.
            ctx.save_for_backward(
                input if weight.requires_grad else _null_tensor,
                weight if input.requires_grad else _null_tensor,
            )
            ctx.input_shape = input.shape
            # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere).
            if weight_shape[2:] == stride == dilation == (
                    1, 1) and padding == (
                        0, 0) and torch.cuda.get_device_capability(
                            input.device) < (8, 0):
                a = weight.reshape(groups, weight_shape[0] // groups,
                                   weight_shape[1])
                b = input.reshape(input.shape[0], groups,
                                  input.shape[1] // groups, -1)
                c = (a.transpose(1, 2) if transpose else a) @ b.permute(
                    1, 2, 0, 3).flatten(2)
                c = c.reshape(-1, input.shape[0],
                              *input.shape[2:]).transpose(0, 1)
                c = c if bias is None else c + bias.unsqueeze(0).unsqueeze(
                    2).unsqueeze(3)
                return c.contiguous(
                    memory_format=(torch.channels_last if input.stride(1) ==
                                   1 else torch.contiguous_format))
            # General case => cuDNN.
            if transpose:
                return torch.nn.functional.conv_transpose2d(
                    input=input,
                    weight=weight,
                    bias=bias,
                    output_padding=output_padding,
                    **common_kwargs)
            return torch.nn.functional.conv2d(
                input=input, weight=weight, bias=bias, **common_kwargs)

        @staticmethod
        def backward(ctx, grad_output):
            input, weight = ctx.saved_tensors
            input_shape = ctx.input_shape
            grad_input = None
            grad_weight = None
            grad_bias = None
            if ctx.needs_input_grad[0]:
                # Input gradient is the opposite-direction convolution with
                # the same weights.
                p = calc_output_padding(
                    input_shape=input_shape, output_shape=grad_output.shape)
                op = _conv2d_gradfix(
                    transpose=(not transpose),
                    weight_shape=weight_shape,
                    output_padding=p,
                    **common_kwargs)
                grad_input = op.apply(grad_output, weight, None)
                assert grad_input.shape == input_shape
            if ctx.needs_input_grad[1] and not weight_gradients_disabled:
                grad_weight = Conv2dGradWeight.apply(grad_output, input)
                assert grad_weight.shape == weight_shape
            if ctx.needs_input_grad[2]:
                grad_bias = grad_output.sum([0, 2, 3])
            return grad_input, grad_weight, grad_bias

    # Gradient with respect to the weights.
    class Conv2dGradWeight(torch.autograd.Function):

        @staticmethod
        def forward(ctx, grad_output, input):
            ctx.save_for_backward(
                grad_output if input.requires_grad else _null_tensor,
                input if grad_output.requires_grad else _null_tensor,
            )
            ctx.grad_output_shape = grad_output.shape
            ctx.input_shape = input.shape
            # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere).
            if weight_shape[2:] == stride == dilation == (
                    1, 1) and padding == (0, 0):
                a = grad_output.reshape(grad_output.shape[0], groups,
                                        grad_output.shape[1] // groups,
                                        -1).permute(1, 2, 0, 3).flatten(2)
                b = input.reshape(input.shape[0], groups,
                                  input.shape[1] // groups,
                                  -1).permute(1, 2, 0, 3).flatten(2)
                c = (b @ a.transpose(1, 2) if transpose else
                     a @ b.transpose(1, 2)).reshape(weight_shape)
                return c.contiguous(
                    memory_format=(torch.channels_last if input.stride(1) ==
                                   1 else torch.contiguous_format))
            # PyTorch consolidated convolution backward API in PR:
            # https://github.com/pytorch/pytorch/commit/3dc3651e0ee3623f669c3a2c096408dbc476d122 # noqa: E501
            # Enhance the code referring to the discussion:
            # https://github.com/pytorch/pytorch/issues/74437
            if digit_version(torch.__version__) >= digit_version('1.11.0'):
                empty_weight = torch.tensor(
                    0.0, dtype=input.dtype,
                    device=input.device).expand(weight_shape)
                output_padding = calc_output_padding(input.shape,
                                                     grad_output.shape)
                return torch.ops.aten.convolution_backward(
                    grad_output,
                    input,
                    empty_weight,
                    None,
                    stride=stride,
                    dilation=dilation,
                    transposed=transpose,
                    padding=padding,
                    groups=groups,
                    output_padding=output_padding,
                    output_mask=[0, 1, 0])[1]
            else:
                # Pre-1.11 fallback: call the backend-specific backward-weight
                # op directly through the JIT operator registry.
                is_rocm_pytorch = False
                try:
                    from torch.utils.cpp_extension import ROCM_HOME
                    is_rocm_pytorch = True if ((torch.version.hip is not None) and
                                               (ROCM_HOME is not None)) else False
                except ImportError:
                    pass
                name=''
                flags=[]
                if is_rocm_pytorch:
                    name = ('aten::miopen_convolution_transpose_backward_weight'
                            if transpose else
                            'aten::miopen_convolution_backward_weight')
                    flags = [
                        torch.backends.cudnn.benchmark,
                        torch.backends.cudnn.deterministic
                    ]
                else:
                    # General case => cuDNN.
                    name = ('aten::cudnn_convolution_transpose_backward_weight'
                            if transpose else
                            'aten::cudnn_convolution_backward_weight')
                    flags = [
                        torch.backends.cudnn.benchmark,
                        torch.backends.cudnn.deterministic,
                        torch.backends.cudnn.allow_tf32
                    ]
                return torch._C._jit_get_operation(name)(weight_shape,
                                                         grad_output, input,
                                                         padding, stride,
                                                         dilation, groups,
                                                         *flags)

        @staticmethod
        def backward(ctx, grad2_grad_weight):
            grad_output, input = ctx.saved_tensors
            grad_output_shape = ctx.grad_output_shape
            input_shape = ctx.input_shape
            grad2_grad_output = None
            grad2_input = None
            if ctx.needs_input_grad[0]:
                grad2_grad_output = Conv2d.apply(input, grad2_grad_weight,
                                                 None)
                assert grad2_grad_output.shape == grad_output_shape
            if ctx.needs_input_grad[1]:
                p = calc_output_padding(
                    input_shape=input_shape, output_shape=grad_output_shape)
                op = _conv2d_gradfix(
                    transpose=(not transpose),
                    weight_shape=weight_shape,
                    output_padding=p,
                    **common_kwargs)
                grad2_input = op.apply(grad_output, grad2_grad_weight, None)
                assert grad2_input.shape == input_shape
            return grad2_grad_output, grad2_input

    _conv2d_gradfix_cache[key] = Conv2d
    return Conv2d
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou'])
def convex_giou(pointsets: torch.Tensor,
                polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return generalized intersection-over-union (Jaccard index) between
    point sets and polygons.

    Args:
        pointsets (torch.Tensor): It has shape (N, 18),
            indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
        polygons (torch.Tensor): It has shape (N, 8),
            indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The first element is the gious
        between point sets and polygons with the shape (N,). The second
        element is the gradient of point sets with the shape (N, 18).
    """
    num_pointsets = pointsets.size(0)
    # The extension fills 18 gradient entries plus one giou value per row.
    buffer = pointsets.new_zeros((num_pointsets, 19))
    ext_module.convex_giou(pointsets, polygons, buffer)
    gious = buffer[:, -1]
    points_grad = buffer[:, :-1]
    return gious, points_grad
def convex_iou(pointsets: torch.Tensor,
               polygons: torch.Tensor) -> torch.Tensor:
    """Return intersection-over-union (Jaccard index) between point sets and
    polygons.

    Args:
        pointsets (torch.Tensor): It has shape (N, 18),
            indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
        polygons (torch.Tensor): It has shape (K, 8),
            indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.

    Returns:
        torch.Tensor: Return the ious between point sets and polygons with
        the shape (N, K).
    """
    num_pointsets = pointsets.size(0)
    num_polygons = polygons.size(0)
    ious = pointsets.new_zeros((num_pointsets, num_polygons))
    ext_module.convex_iou(pointsets, polygons, ious)
    return ious
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import torch import torch
from torch import Tensor, nn from torch import nn
from mmengine.utils import digit_version from torch.autograd import Function
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward',
'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward',
'right_pool_forward', 'right_pool_backward'
])
_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} _mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor: class TopPoolFunction(Function):
size = x.size(dim)
output = x.clone()
ind = 1 @staticmethod
while ind < size: def symbolic(g, input):
if flip: output = g.op(
cur_start = 0 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
cur_len = size - ind return output
next_start = ind
next_len = size - ind @staticmethod
else: def forward(ctx, input):
cur_start = ind output = ext_module.top_pool_forward(input)
cur_len = size - ind ctx.save_for_backward(input)
next_start = 0 return output
next_len = size - ind
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.top_pool_backward(input, grad_output)
return output
class BottomPoolFunction(Function):
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.bottom_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.bottom_pool_backward(input, grad_output)
return output
# max_temp should be cloned for backward computation
max_temp = output.narrow(dim, cur_start, cur_len).clone()
cur_temp = output.narrow(dim, cur_start, cur_len)
next_temp = output.narrow(dim, next_start, next_len)
cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp) class LeftPoolFunction(Function):
ind = ind << 1 @staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
return output
return output @staticmethod
def forward(ctx, input):
output = ext_module.left_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.left_pool_backward(input, grad_output)
return output
class RightPoolFunction(Function):
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.right_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.right_pool_backward(input, grad_output)
return output
class CornerPool(nn.Module): class CornerPool(nn.Module):
...@@ -40,13 +104,11 @@ class CornerPool(nn.Module): ...@@ -40,13 +104,11 @@ class CornerPool(nn.Module):
Corner Pooling is a new type of pooling layer that helps a Corner Pooling is a new type of pooling layer that helps a
convolutional network better localize corners of bounding boxes. convolutional network better localize corners of bounding boxes.
Please refer to `CornerNet: Detecting Objects as Paired Keypoints Please refer to https://arxiv.org/abs/1808.01244 for more details.
<https://arxiv.org/abs/1808.01244>`_ for more details.
Code is modified from https://github.com/princeton-vl/CornerNet-Lite. Code is modified from https://github.com/princeton-vl/CornerNet-Lite.
Args: Args:
mode (str): Pooling orientation for the pooling layer mode(str): Pooling orientation for the pooling layer
- 'bottom': Bottom Pooling - 'bottom': Bottom Pooling
- 'left': Left Pooling - 'left': Left Pooling
...@@ -57,6 +119,13 @@ class CornerPool(nn.Module): ...@@ -57,6 +119,13 @@ class CornerPool(nn.Module):
Feature map after pooling. Feature map after pooling.
""" """
pool_functions = {
'bottom': BottomPoolFunction,
'left': LeftPoolFunction,
'right': RightPoolFunction,
'top': TopPoolFunction,
}
cummax_dim_flip = { cummax_dim_flip = {
'bottom': (2, False), 'bottom': (2, False),
'left': (3, True), 'left': (3, True),
...@@ -64,13 +133,23 @@ class CornerPool(nn.Module): ...@@ -64,13 +133,23 @@ class CornerPool(nn.Module):
'top': (2, True), 'top': (2, True),
} }
def __init__(self, mode: str): def __init__(self, mode):
super().__init__() super(CornerPool, self).__init__()
assert mode in self.cummax_dim_flip assert mode in self.pool_functions
self.mode = mode self.mode = mode
self.corner_pool = self.pool_functions[mode]
def forward(self, x):
if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
if torch.onnx.is_in_onnx_export():
assert torch.__version__ >= '1.7.0', \
'When `cummax` serves as an intermediate component whose '\
'outputs is used as inputs for another modules, it\'s '\
'expected that pytorch version must be >= 1.7.0, '\
'otherwise Error appears like: `RuntimeError: tuple '\
'appears in op that does not forward tuples, unsupported '\
'kind: prim::PythonOp`.'
def forward(self, x: Tensor) -> Tensor:
if torch.__version__ != 'parrots' and digit_version(torch.__version__) >= digit_version('1.5.0'):
dim, flip = self.cummax_dim_flip[self.mode] dim, flip = self.cummax_dim_flip[self.mode]
if flip: if flip:
x = x.flip(dim) x = x.flip(dim)
...@@ -79,5 +158,4 @@ class CornerPool(nn.Module): ...@@ -79,5 +158,4 @@ class CornerPool(nn.Module):
pool_tensor = pool_tensor.flip(dim) pool_tensor = pool_tensor.flip(dim)
return pool_tensor return pool_tensor
else: else:
dim, flip = self.cummax_dim_flip[self.mode] return self.corner_pool.apply(x)
return _corner_pool(x, dim, flip)
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch import torch
from torch import Tensor, nn from torch import Tensor, nn
from torch.autograd import Function from torch.autograd import Function
...@@ -17,14 +15,14 @@ class CorrelationFunction(Function): ...@@ -17,14 +15,14 @@ class CorrelationFunction(Function):
@staticmethod @staticmethod
def forward(ctx, def forward(ctx,
input1: Tensor, input1,
input2: Tensor, input2,
kernel_size: int = 1, kernel_size=1,
max_displacement: int = 1, max_displacement=1,
stride: int = 1, stride=1,
padding: int = 1, padding=1,
dilation: int = 1, dilation=1,
dilation_patch: int = 1) -> Tensor: dilation_patch=1):
ctx.save_for_backward(input1, input2) ctx.save_for_backward(input1, input2)
...@@ -62,9 +60,7 @@ class CorrelationFunction(Function): ...@@ -62,9 +60,7 @@ class CorrelationFunction(Function):
@staticmethod @staticmethod
@once_differentiable @once_differentiable
def backward( def backward(ctx, grad_output):
ctx, grad_output: Tensor
) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]:
input1, input2 = ctx.saved_tensors input1, input2 = ctx.saved_tensors
kH, kW = ctx.kernel_size kH, kW = ctx.kernel_size
......
...@@ -13,150 +13,158 @@ This folder contains all non-python code for MMCV custom ops. Please follow the ...@@ -13,150 +13,158 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│ ├── pytorch_cpp_helper.hpp │ ├── pytorch_cpp_helper.hpp
│ ├── pytorch_cuda_helper.hpp │ ├── pytorch_cuda_helper.hpp
│ ├── pytorch_device_registry.hpp │ ├── pytorch_device_registry.hpp
│   ├── cuda │   └── cuda
│   │ ├── common_cuda_helper.hpp │   ├── common_cuda_helper.hpp
│   │ ├── parrots_cudawarpfunction.cuh │   ├── parrots_cudawarpfunction.cuh
│   │ ├── ... │   ├── ...
│   │ └── ops_cuda_kernel.cuh │   └── ops_cuda_kernel.cuh
|   ├── mps ├── onnxruntime
│   │ ├── MPSLibrary.h │   ├── onnxruntime_register.h
│   │ ├── ... │   ├── onnxruntime_session_options_config_keys.h
│   │ └── MPSUtils.h │   ├── ort_mmcv_utils.h
|   ├── mlu │   ├── ...
│   │ └── ... │   ├── onnx_ops.h
|   └── utils │   └── cpu
│   │ └── ... │ ├── onnxruntime_register.cpp
│      ├── ...
│      └── onnx_ops_impl.cpp
├── parrots ├── parrots
│   ├── ... │   ├── ...
│   ├── ops.cpp │   ├── ops.cpp
│   ├── ops_parrots.cpp │   ├── ops_parrots.cpp
│   └── ops_pytorch.h │   └── ops_pytorch.h
└── pytorch ├── pytorch
    ├── info.cpp │   ├── info.cpp
    ├── pybind.cpp │   ├── pybind.cpp
    ├── ... │   ├── ...
    ├── ops.cpp │   ├── ops.cpp
    ├── cuda │   ├── cuda
    │   ├── ... │   │   ├── ...
    │   └── ops_cuda.cu │   │   └── ops_cuda.cu
    ├── cpu │   └── cpu
    │   ├── ... │      ├── ...
    │   └── ops.cpp │      └── ops.cpp
    ├── mps └── tensorrt
    │   ├── ... ├── trt_cuda_helper.cuh
    |   └── op_mps.mm ├── trt_plugin_helper.hpp
    └── mlu ├── trt_plugin.hpp
       ├── ... ├── trt_serialize.hpp
       └── op_mlu.cpp ├── ...
├── trt_ops.hpp
└── plugins
   ├── trt_cuda_helper.cu
   ├── trt_plugin.cpp
   ├── ...
   ├── trt_ops.cpp
   └── trt_ops_kernel.cu
``` ```
## Components ## Components
- `common`: This directory contains all tools and shared codes. - `common`: This directory contains all tools and shared codes.
- `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax. - `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax.
- `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**. - `onnxruntime`: **ONNX Runtime** support for custom ops.
- `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) device. - `cpu`: CPU implementation of supported ops.
- `utils`: The kernels and utils of spconv.
- `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory. - `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory. - `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory.
- `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops. - `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops.
- `cpu`: This directory contain cpu implementations of corresponding custom ops. - `cpu`: This directory contain cpu implementations of corresponding custom ops.
- `mlu`: This directory contain launchers of each MLU kernels. - `tensorrt`: **TensorRT** support for custom ops.
- `mps`: MPS ops implementation and launchers. - `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use shared cuda kernel in `common/cuda`.
## How to add new PyTorch ops? ## How to add new PyTorch ops?
1. (Optional) Add shared kernel in `common` to support special hardware platform. 1. (Optional) Add shared kernel in `common` to support special hardware platform.
```c++ ```c++
// src/common/cuda/new_ops_cuda_kernel.cuh // src/common/cuda/new_ops_cuda_kernel.cuh
template <typename T> template <typename T>
__global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) { __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {
// forward here // forward here
} }
``` ```
Add cuda kernel launcher in `pytorch/cuda`. Add cuda kernel launcher in `pytorch/cuda`.
```c++ ```c++
// src/pytorch/cuda // src/pytorch/cuda
#include <new_ops_cuda_kernel.cuh> #include <new_ops_cuda_kernel.cuh>
void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){ void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){
// initialize // initialize
at::cuda::CUDAGuard device_guard(input.device()); at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
... ...
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] { input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] {
new_ops_forward_cuda_kernel<scalar_t> new_ops_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...); input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...);
})); }));
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
} }
``` ```
2. Register implementation for different devices. 2. Register implementation for different devices.
```c++ ```c++
// src/pytorch/cuda/cudabind.cpp // src/pytorch/cuda/cudabind.cpp
... ...
Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){ Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
// implement cuda forward here // implement cuda forward here
// use `NewOpsForwardCUDAKernelLauncher` here // use `NewOpsForwardCUDAKernelLauncher` here
} }
// declare interface here. // declare interface here.
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...); Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
// register the implementation for given device (CUDA here). // register the implementation for given device (CUDA here).
REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda); REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
``` ```
3. Add ops implementation in `pytorch` directory. Select different implementations according to device type. 3. Add ops implementation in `pytorch` directory. Select different implementations according to device type.
```c++ ```c++
// src/pytorch/new_ops.cpp // src/pytorch/new_ops.cpp
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){ Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
// dispatch the implementation according to the device type of input. // dispatch the implementation according to the device type of input.
DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...); DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
} }
... ...
Tensor new_ops_forward(Tensor input, Tensor output, ...){ Tensor new_ops_forward(Tensor input, Tensor output, ...){
return new_ops_forward_impl(input, output, ...); return new_ops_forward_impl(input, output, ...);
} }
``` ```
4. Binding the implementation in `pytorch/pybind.cpp` 4. Binding the implementation in `pytorch/pybind.cpp`
```c++ ```c++
// src/pytorch/pybind.cpp // src/pytorch/pybind.cpp
... ...
Tensor new_ops_forward(Tensor input, Tensor output, ...); Tensor new_ops_forward(Tensor input, Tensor output, ...);
... ...
// bind with pybind11 // bind with pybind11
m.def("new_ops_forward", &new_ops_forward, "new_ops_forward", m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
py::arg("input"), py::arg("output"), ...); py::arg("input"), py::arg("output"), ...);
... ...
``` ```
5. Build MMCV again. Enjoy new ops in python 5. Build MMCV again. Enjoy new ops in python
```python ```python
from ..utils import ext_loader from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['new_ops_forward']) ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])
... ...
ext_module.new_ops_forward(input, output, ...) ext_module.new_ops_forward(input, output, ...)
``` ```
...@@ -220,10 +220,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24], ...@@ -220,10 +220,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
return temp > 0; return temp > 0;
} }
}); });
// compute distance to origin after sort, since the points are now different.
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
#endif #endif
// Step 4: // Step 4:
...@@ -270,17 +266,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24], ...@@ -270,17 +266,6 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
return m; return m;
} }
template <typename T>
HOST_DEVICE_INLINE T quadri_box_area(const Point<T> (&q)[4]) {
T area = 0;
#pragma unroll
for (int i = 1; i < 3; i++) {
area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
template <typename T> template <typename T>
HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) { HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
if (m <= 2) { if (m <= 2) {
...@@ -319,25 +304,6 @@ HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox<T>& box1, ...@@ -319,25 +304,6 @@ HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox<T>& box1,
return polygon_area<T>(orderedPts, num_convex); return polygon_area<T>(orderedPts, num_convex);
} }
template <typename T>
HOST_DEVICE_INLINE T quadri_boxes_intersection(const Point<T> (&pts1)[4],
const Point<T> (&pts2)[4]) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
Point<T> intersectPts[24], orderedPts[24];
int num = get_intersection_points<T>(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
return polygon_area<T>(orderedPts, num_convex);
}
} // namespace } // namespace
template <typename T> template <typename T>
...@@ -375,52 +341,3 @@ HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw, ...@@ -375,52 +341,3 @@ HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw,
const T iou = intersection / baseS; const T iou = intersection / baseS;
return iou; return iou;
} }
template <typename T>
HOST_DEVICE_INLINE T single_box_iou_quadri(T const* const pts1_raw,
T const* const pts2_raw,
const int mode_flag) {
// shift center to the middle point to achieve higher precision in result
Point<T> pts1[4], pts2[4];
auto center_shift_x =
(pts1_raw[0] + pts2_raw[0] + pts1_raw[2] + pts2_raw[2] + pts1_raw[4] +
pts2_raw[4] + pts1_raw[6] + pts2_raw[6]) /
8.0;
auto center_shift_y =
(pts1_raw[1] + pts2_raw[1] + pts1_raw[3] + pts2_raw[3] + pts1_raw[5] +
pts2_raw[5] + pts1_raw[7] + pts2_raw[7]) /
8.0;
pts1[0].x = pts1_raw[0] - center_shift_x;
pts1[0].y = pts1_raw[1] - center_shift_y;
pts1[1].x = pts1_raw[2] - center_shift_x;
pts1[1].y = pts1_raw[3] - center_shift_y;
pts1[2].x = pts1_raw[4] - center_shift_x;
pts1[2].y = pts1_raw[5] - center_shift_y;
pts1[3].x = pts1_raw[6] - center_shift_x;
pts1[3].y = pts1_raw[7] - center_shift_y;
pts2[0].x = pts2_raw[0] - center_shift_x;
pts2[0].y = pts2_raw[1] - center_shift_y;
pts2[1].x = pts2_raw[2] - center_shift_x;
pts2[1].y = pts2_raw[3] - center_shift_y;
pts2[2].x = pts2_raw[4] - center_shift_x;
pts2[2].y = pts2_raw[5] - center_shift_y;
pts2[3].x = pts2_raw[6] - center_shift_x;
pts2[3].y = pts2_raw[7] - center_shift_y;
const T area1 = quadri_box_area<T>(pts1);
const T area2 = quadri_box_area<T>(pts2);
if (area1 < 1e-14 || area2 < 1e-14) {
return 0.f;
}
const T intersection = quadri_boxes_intersection<T>(pts1, pts2);
T baseS = 1.0;
if (mode_flag == 0) {
baseS = (area1 + area2 - intersection);
} else if (mode_flag == 1) {
baseS = area1;
}
const T iou = intersection / baseS;
return iou;
}
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
template <typename scalar_t>
// Forward kernel: copies each filter weight into every rotated copy of the
// filter bank, at destinations given by the (1-based) index map
// `indices_data`. One thread handles one source weight element.
__global__ void active_rotated_filter_forward_cuda_kernel(
    const int nthreads, const scalar_t* weight_data, const int* indices_data,
    const int num_input_planes, const int num_output_planes,
    const int num_orientations, const int num_rotations, const int nEntry,
    scalar_t* output_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat index: l = offset inside one filter entry,
    // j = input plane, i = remaining outer index.
    int l = index % nEntry;
    int j = (index / nEntry) % num_input_planes;
    int i = index / nEntry / num_input_planes;
    int k;
    scalar_t val = *(weight_data + index);
    for (k = 0; k < num_rotations; k++) {
      // indices_data holds 1-based destination offsets; convert to 0-based.
      int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
      // Destination layout: [i][rotation k][input plane j][entry idx].
      scalar_t* target = output_data +
                         i * (num_rotations * num_input_planes * nEntry) +
                         k * (num_input_planes * nEntry) + j * (nEntry) + idx;
      *target = val;
    }
  }
}
template <typename scalar_t>
// Backward kernel: accumulates, for each source weight element, the
// gradients of all its rotated copies (mirror of the forward scatter).
// Each thread owns one weight element, so no atomics are needed.
__global__ void active_rotated_filter_backward_cuda_kernel(
    const int nthreads, const scalar_t* gradWeight_data,
    const int* indices_data, const int num_input_planes,
    const int num_output_planes, const int num_orientations,
    const int num_rotations, const int nEntry, scalar_t* weight_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Same index decomposition as the forward kernel.
    int l = index % nEntry;
    int j = (index / nEntry) % num_input_planes;
    int i = index / nEntry / num_input_planes;
    int k;
    scalar_t* val = weight_data + index;
    *val = 0;
    scalar_t tmp = 0;
    for (k = 0; k < num_rotations; k++) {
      // Read back from the same (1-based) destination the forward wrote to.
      int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
      scalar_t target =
          *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) +
            k * (num_input_planes * nEntry) + j * (nEntry) + idx);
      tmp = tmp + target;
    }
    *val = tmp;
  }
}
#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
...@@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel( ...@@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel(
const int O, const int aggregate, const T* points, const T* centers, const int O, const int aggregate, const T* points, const T* centers,
const T* scores, const int64_t* knn_idx, T* output) { const T* scores, const int64_t* knn_idx, T* output) {
// ----- parallel loop for B, N1, K and O --------- // ----- parallel loop for B, N1, K and O ---------
CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) { long i = blockIdx.x * blockDim.x + threadIdx.x;
// ------- loop for M ---------- if (i >= B * N1 * K * O) return;
const int b = (int)(i / (O * N1 * K)); // ------- loop for M ----------
const int o = (int)(i % (O * N1 * K) / (N1 * K)); const int b = (int)(i / (O * N1 * K));
const int n = (int)(i % (N1 * K) / K); const int o = (int)(i % (O * N1 * K) / (N1 * K));
const int k = (int)(i % K); const int n = (int)(i % (N1 * K) / K);
const int cn = (int)knn_idx[b * K * N1 + n * K + const int k = (int)(i % K);
0]; // The first neighbor is the center point const int cn = (int)knn_idx[b * K * N1 + n * K +
const int kn = (int)knn_idx[b * K * N1 + n * K + k]; 0]; // The first neighbor is the center point
if (kn >= N0 || const int kn = (int)knn_idx[b * K * N1 + n * K + k];
kn < 0) { // if index overflows, it is out of the neighborhood range if (kn >= N0 ||
return; kn < 0) { // if index overflows, it is out of the neighborhood range
} return;
assert(b < B); }
assert(kn < N0); assert(b < B);
assert(cn < N0); assert(kn < N0);
assert(o < O); assert(cn < N0);
assert(n < N1); assert(o < O);
const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; assert(n < N1);
T val = output[out_idx]; const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
for (int m = 0; m < M; m++) { T val = output[out_idx];
val += points[b * N0 * M * O + kn * M * O + m * O + o] * for (int m = 0; m < M; m++) {
scores[b * N1 * K * M + n * K * M + k * M + m] - val += points[b * N0 * M * O + kn * M * O + m * O + o] *
centers[b * N0 * M * O + cn * M * O + m * O + o] * scores[b * N1 * K * M + n * K * M + k * M + m] -
scores[b * N1 * K * M + n * K * M + k * M + m]; centers[b * N0 * M * O + cn * M * O + m * O + o] *
} scores[b * N1 * K * M + n * K * M + k * M + m];
output[out_idx] = val;
} }
output[out_idx] = val;
} }
template <typename T> template <typename T>
...@@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel( ...@@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* scores, const int O, const int aggregate, const T* grad_out, const T* scores,
const int64_t* knn_idx, T* grad_points, T* grad_centers) { const int64_t* knn_idx, T* grad_points, T* grad_centers) {
// ----- parallel loop for B, M, O --------- // ----- parallel loop for B, M, O ---------
CUDA_1D_KERNEL_LOOP(i, B * M * O) { long i = blockIdx.x * blockDim.x + threadIdx.x;
int b = (int)(i / (M * O)); if (i >= B * M * O) return;
int m = (int)(i % (M * O) / O); int b = (int)(i / (M * O));
int o = (int)(i % O); int m = (int)(i % (M * O) / O);
int o = (int)(i % O);
// ----- loop for N,K --------- // ----- loop for N,K ---------
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
for (int k = 0; k < K; k++) { for (int k = 0; k < K; k++) {
int kn = knn_idx[b * N * K + n * K + k]; int kn = knn_idx[b * N * K + n * K + k];
int cn = knn_idx[b * N * K + n * K + 0]; int cn = knn_idx[b * N * K + n * K + 0];
if (kn >= N0 || kn < 0) { // if index overflows, it is out of the if (kn >= N0 ||
// neighborhood range kn < 0) { // if index overflows, it is out of the neighborhood range
continue; continue;
}
atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
-scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
} }
atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
-scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
} }
} }
} }
...@@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel( ...@@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* points, const int O, const int aggregate, const T* grad_out, const T* points,
const T* centers, const int64_t* knn_idx, T* grad_scores) { const T* centers, const int64_t* knn_idx, T* grad_scores) {
// ----- parallel loop for B, N, K, M --------- // ----- parallel loop for B, N, K, M ---------
CUDA_1D_KERNEL_LOOP(i, B * N * K * M) { long i = blockIdx.x * blockDim.x + threadIdx.x;
const int b = (int)(i / (N * M * K)); if (i >= B * N * K * M) return;
const int n = (int)(i % (N * M * K) / M / K); const int b = (int)(i / (N * M * K));
const int k = (int)(i % (M * K) / M); const int n = (int)(i % (N * M * K) / M / K);
const int m = (int)(i % M); const int k = (int)(i % (M * K) / M);
const int cn = knn_idx[b * N * K + n * K + 0]; const int m = (int)(i % M);
const int kn = knn_idx[b * N * K + n * K + k]; const int cn = knn_idx[b * N * K + n * K + 0];
if (kn >= N0 || const int kn = knn_idx[b * N * K + n * K + k];
kn < 0) { // if index overflows, it is out of the neighborhood range if (kn >= N0 ||
return; kn < 0) { // if index overflows, it is out of the neighborhood range
} return;
}
// -------------- loop for O ------------------------ // -------------- loop for O ------------------------
const int out_idx = b * N * K * M + n * K * M + k * M + m; const int out_idx = b * N * K * M + n * K * M + k * M + m;
T val = grad_scores[out_idx]; T val = grad_scores[out_idx];
for (int o = 0; o < O; o++) { for (int o = 0; o < O; o++) {
val += (points[b * N0 * M * O + kn * M * O + m * O + o] - val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
centers[b * N0 * M * O + cn * M * O + m * O + o]) * centers[b * N0 * M * O + cn * M * O + m * O + o]) *
grad_out[b * O * N * K + o * N * K + n * K + k]; grad_out[b * O * N * K + o * N * K + n * K + k];
}
grad_scores[out_idx] = val;
} }
grad_scores[out_idx] = val;
} }
#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH #endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
...@@ -21,36 +21,35 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m, ...@@ -21,36 +21,35 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
// output: // output:
// idx: (B, M, nsample) // idx: (B, M, nsample)
int bs_idx = blockIdx.y; int bs_idx = blockIdx.y;
CUDA_1D_KERNEL_LOOP(pt_idx, m) { int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b) return; if (bs_idx >= b || pt_idx >= m) return;
new_xyz += bs_idx * m * 3 + pt_idx * 3; new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3; xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample; idx += bs_idx * m * nsample + pt_idx * nsample;
float max_radius2 = max_radius * max_radius; float max_radius2 = max_radius * max_radius;
float min_radius2 = min_radius * min_radius; float min_radius2 = min_radius * min_radius;
T new_x = new_xyz[0]; T new_x = new_xyz[0];
T new_y = new_xyz[1]; T new_y = new_xyz[1];
T new_z = new_xyz[2]; T new_z = new_xyz[2];
int cnt = 0; int cnt = 0;
for (int k = 0; k < n; ++k) { for (int k = 0; k < n; ++k) {
T x = xyz[k * 3 + 0]; T x = xyz[k * 3 + 0];
T y = xyz[k * 3 + 1]; T y = xyz[k * 3 + 1];
T z = xyz[k * 3 + 2]; T z = xyz[k * 3 + 2];
T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z); (new_z - z) * (new_z - z);
if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
if (cnt == 0) { if (cnt == 0) {
for (int l = 0; l < nsample; ++l) { for (int l = 0; l < nsample; ++l) {
idx[l] = k; idx[l] = k;
}
} }
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
} }
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
} }
} }
} }
......
...@@ -8,27 +8,6 @@ ...@@ -8,27 +8,6 @@
#include "pytorch_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
#endif #endif
// Generic box loader: reads the four corner coordinates (x1, y1, x2, y2) of
// one box starting at bbox[base] into the caller-provided references.
template <typename T>
__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,
                                          T& y1, T& x2, T& y2) {
  const T* box = bbox + base;
  x1 = box[0];
  y1 = box[1];
  x2 = box[2];
  y2 = box[3];
}
// float specialization: fetches all four coordinates of one box with a single
// vectorized float4 load instead of four scalar loads.
template <>
__device__ __forceinline__ void load_bbox<float>(const float* bbox,
                                                 const int base, float& x1,
                                                 float& y1, float& x2,
                                                 float& y2) {
  // NOTE(review): the reinterpret_cast assumes bbox + base is 16-byte
  // aligned -- holds when base is a multiple of 4 and the underlying buffer
  // is 16-byte aligned (typical for CUDA device allocations); confirm at
  // call sites.
  const float4 bbox_offset = reinterpret_cast<const float4*>(bbox + base)[0];
  x1 = bbox_offset.x;
  y1 = bbox_offset.y;
  x2 = bbox_offset.z;
  y2 = bbox_offset.w;
}
template <typename T> template <typename T>
__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
T* ious, const int num_bbox1, T* ious, const int num_bbox1,
...@@ -37,111 +16,69 @@ __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, ...@@ -37,111 +16,69 @@ __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
const int offset) { const int offset) {
if (aligned) { if (aligned) {
CUDA_1D_KERNEL_LOOP(index, num_bbox1) { CUDA_1D_KERNEL_LOOP(index, num_bbox1) {
const int b1 = index; int b1 = index;
const int b2 = index; int b2 = index;
const int base1 = b1 << 2; // b1 * 4 int base1 = b1 * 4;
T b1_x1, b1_y1, b1_x2, b1_y2; T b1_x1 = bbox1[base1];
load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); T b1_y1 = bbox1[base1 + 1];
const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
const int base2 = b2 << 2; // b2 * 4 T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
T b2_x1, b2_y1, b2_x2, b2_y2;
load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); int base2 = b2 * 4;
const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); T b2_x2 = bbox2[base2 + 2];
const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); T b2_y2 = bbox2[base2 + 3];
const T width = fmaxf(right - left + offset, 0.f); T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
const T height = fmaxf(bottom - top + offset, 0.f);
const T interS = width * height; T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
const T baseS = T width = fmaxf(right - left + offset, 0.f);
fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
ious[index] = interS / baseS; ious[index] = interS / baseS;
} }
} else { } else {
CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) { CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
const int b1 = index / num_bbox2; int b1 = index / num_bbox2;
const int b2 = index % num_bbox2; int b2 = index % num_bbox2;
const int base1 = b1 << 2; // b1 * 4 int base1 = b1 * 4;
T b1_x1, b1_y1, b1_x2, b1_y2; T b1_x1 = bbox1[base1];
load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); T b1_y1 = bbox1[base1 + 1];
const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
const int base2 = b2 << 2; // b2 * 4 T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
T b2_x1, b2_y1, b2_x2, b2_y2;
load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); int base2 = b2 * 4;
const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); T b2_x2 = bbox2[base2 + 2];
const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); T b2_y2 = bbox2[base2 + 3];
const T width = fmaxf(right - left + offset, 0.f); T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
const T height = fmaxf(bottom - top + offset, 0.f);
const T interS = width * height; T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
const T baseS = T width = fmaxf(right - left + offset, 0.f);
fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
ious[index] = interS / baseS; ious[index] = interS / baseS;
} }
} }
} }
#if __CUDA_ARCH__ >= 530
// fp16 box area: (x2 - x1 + offset) * (y2 - y1 + offset), entirely in __half
// intrinsics. `offset` is the pixel-inclusive correction already converted to
// __half (presumably 0 or 1 -- confirm at call sites).
__device__ __forceinline__ __half __half_area(const __half x1, const __half y1,
                                              const __half x2, const __half y2,
                                              const __half offset) {
  const __half half_w = __hadd(__hsub(x2, x1), offset);
  const __half half_h = __hadd(__hsub(y2, y1), offset);
  return __hmul(half_w, half_h);
}
// fp16 max via the __hge intrinsic: returns a when a >= b, else b.
__device__ __forceinline__ __half __half_max(const __half a, const __half b) {
  return __hge(a, b) ? a : b;
}
// fp16 min via the __hle intrinsic: returns a when a <= b, else b.
__device__ __forceinline__ __half __half_min(const __half a, const __half b) {
  return __hle(a, b) ? a : b;
}
// fp16 won't provide much increase when aligned==true. It is useful when
// aligned==false, which would give you ~40% bonus.
// Half-precision bbox overlap: writes IoU (mode == 0) or intersection over
// bbox1's area (otherwise) into `ious`. aligned == true pairs bbox1[i] with
// bbox2[i] (num_bbox1 outputs); otherwise the full num_bbox1 x num_bbox2
// grid is computed, one output element per loop iteration.
__device__ void bbox_overlaps_cuda_kernel_half(
    const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1,
    const int num_bbox2, const int mode, const bool aligned, const int offset) {
  const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2;
  // Convert the integer pixel-offset once; reused for widths and clamping.
  const __half h_offset = __int2half_rn(offset);
  CUDA_1D_KERNEL_LOOP(index, num_output) {
    const int b1 = aligned ? index : index / num_bbox2;
    const int b2 = aligned ? index : index % num_bbox2;
    const int base1 = b1 << 2;  // b1 * 4: four coords per box
    __half b1_x1, b1_y1, b1_x2, b1_y2;
    load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
    const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset);
    const int base2 = b2 << 2;  // b2 * 4
    __half b2_x1, b2_y1, b2_x2, b2_y2;
    load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
    const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset);
    // Intersection rectangle; width/height clamp to 0 when boxes are disjoint.
    const __half left = __half_max(b1_x1, b2_x1),
                 right = __half_min(b1_x2, b2_x2);
    const __half top = __half_max(b1_y1, b2_y1),
                 bottom = __half_min(b1_y2, b2_y2);
    const __half width =
        __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f));
    const __half height =
        __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f));
    const __half interS = __hmul(width, height);
    // mode == 0: union area; otherwise bbox1's area. Clamped below by
    // h_offset to avoid a zero denominator.
    const __half baseS = __half_max(
        mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area,
        h_offset);
    ious[index] = __hdiv(interS, baseS);
  }
}
#endif // __CUDA_ARCH__ >= 530
#endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH #endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu
#ifndef BEZIER_ALIGN_CUDA_KERNEL_CUH
#define BEZIER_ALIGN_CUDA_KERNEL_CUH
#include <float.h>
#ifdef MMCV_WITH_TRT
#include "common_cuda_helper.hpp"
#else // MMCV_WITH_TRT
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif // MMCV_USE_PARROTS
#endif // MMCV_WITH_TRT
// Evaluate a cubic Bezier curve at parameter u in [0, 1] given its four
// control points p0..p3 (Bernstein basis). The complement (1 - u) is hoisted
// into a named temporary; the arithmetic is otherwise identical.
template <typename T>
__device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3,
                          const T u) {
  const T w = 1. - u;
  return w * w * w * p0 + 3. * u * w * w * p1 + 3. * u * u * w * p2 +
         u * u * u * p3;
}
// Forward pass of BezierAlign. For each pooled output element (n, c, ph, pw)
// the thread maps the bin onto the region bounded by two cubic Bezier curves
// (read from the RoI's 8 control points), then average-pools bilinearly
// sampled values from the input feature map.
//
// bottom_data: input feature map, laid out (batch, channels, height, width)
// bottom_rois: one row of 17 values per RoI: batch index + 8 (x, y) control
//              points (see the index mapping below)
// top_data:    pooled output, one value per `index`
template <typename T>
__global__ void bezier_align_forward_cuda_kernel(
    const int nthreads,
    const T *bottom_data,  // inputs
    const T *bottom_rois,  // bottom rois contains the bezier curve
    T *top_data,           // outputs
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int sampling_ratio, bool aligned, const int channels,
    const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    // beziers have size Nx(1+8*2) = Nx17
    const T *offset_bottom_rois = bottom_rois + n * 17;
    int roi_batch_ind = offset_bottom_rois[0];
    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    // TODO: avoid this by using parallel annotation, for good
    // Control points 9..16 are read in reversed pair order (15,16 .. 9,10),
    // so both curves are traversed in the same parameter direction
    // (presumably left-to-right -- confirm against the RoI producer).
    T p0_x = offset_bottom_rois[1] * spatial_scale;
    T p0_y = offset_bottom_rois[2] * spatial_scale;
    T p1_x = offset_bottom_rois[3] * spatial_scale;
    T p1_y = offset_bottom_rois[4] * spatial_scale;
    T p2_x = offset_bottom_rois[5] * spatial_scale;
    T p2_y = offset_bottom_rois[6] * spatial_scale;
    T p3_x = offset_bottom_rois[7] * spatial_scale;
    T p3_y = offset_bottom_rois[8] * spatial_scale;
    T p4_x = offset_bottom_rois[15] * spatial_scale;
    T p4_y = offset_bottom_rois[16] * spatial_scale;
    T p5_x = offset_bottom_rois[13] * spatial_scale;
    T p5_y = offset_bottom_rois[14] * spatial_scale;
    T p6_x = offset_bottom_rois[11] * spatial_scale;
    T p6_y = offset_bottom_rois[12] * spatial_scale;
    T p7_x = offset_bottom_rois[9] * spatial_scale;
    T p7_y = offset_bottom_rois[10] * spatial_scale;
    // compute the coords
    // u walks along both curves; v interpolates linearly between the two
    // curve points to locate this bin's center, minus the alignment offset.
    const T u = pw / static_cast<T>(pooled_width);
    const T v = ph / static_cast<T>(pooled_height);
    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
    const T x_center = x1 * v + x0 * (1. - v) - offset;
    const T y_center = y1 * v + y0 * (1. - v) - offset;
    // RoI extent estimated from the end points of the two curves.
    T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));
    T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));
    if (!aligned) {  // for backward-compatibility only
      roi_width = max(roi_width, (T)1.);
      roi_height = max(roi_height, (T)1.);
    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
    const T *offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;
    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
    // We do average (integral) pooling inside a bin
    // When the grid is empty, output zeros == 0/1, instead of NaN.
    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4
    T output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1
    {
      // Sample points sit at the centers of a roi_bin_grid_h x roi_bin_grid_w
      // sub-grid spanning the bin around (x_center, y_center).
      const T y = y_center - (T)0.5 * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = x_center - (T)0.5 * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);
        T val = bilinear_interpolate(offset_bottom_data, height, width, y, x,
                                     index);
        output_val += val;
      }
    }
    output_val /= count;
    top_data[index] = output_val;
  }
}
// Backward pass of BezierAlign: scatters each pooled output's gradient back
// to the four bilinear-interpolation corners of every sampling point. Uses
// atomicAdd because neighboring bins and overlapping RoIs may write the same
// input cell. Geometry (control-point mapping, bin centers, sampling grid)
// mirrors the forward kernel and must stay in sync with it.
template <typename T>
__global__ void bezier_align_backward_cuda_kernel(
    const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int sampling_ratio, bool aligned, const int channels,
    const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    // beziers have size Nx(1+8*2) = Nx17
    const T *offset_bottom_rois = bottom_rois + n * 17;
    int roi_batch_ind = offset_bottom_rois[0];
    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    // Control points 9..16 are read in reversed pair order -- identical
    // mapping to the forward kernel, so gradients land at the same
    // sampling locations.
    T p0_x = offset_bottom_rois[1] * spatial_scale;
    T p0_y = offset_bottom_rois[2] * spatial_scale;
    T p1_x = offset_bottom_rois[3] * spatial_scale;
    T p1_y = offset_bottom_rois[4] * spatial_scale;
    T p2_x = offset_bottom_rois[5] * spatial_scale;
    T p2_y = offset_bottom_rois[6] * spatial_scale;
    T p3_x = offset_bottom_rois[7] * spatial_scale;
    T p3_y = offset_bottom_rois[8] * spatial_scale;
    T p4_x = offset_bottom_rois[15] * spatial_scale;
    T p4_y = offset_bottom_rois[16] * spatial_scale;
    T p5_x = offset_bottom_rois[13] * spatial_scale;
    T p5_y = offset_bottom_rois[14] * spatial_scale;
    T p6_x = offset_bottom_rois[11] * spatial_scale;
    T p6_y = offset_bottom_rois[12] * spatial_scale;
    T p7_x = offset_bottom_rois[9] * spatial_scale;
    T p7_y = offset_bottom_rois[10] * spatial_scale;
    // compute the coords
    const T u = pw / static_cast<T>(pooled_width);
    const T v = ph / static_cast<T>(pooled_height);
    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
    const T x_center = x1 * v + x0 * (1. - v) - offset;
    const T y_center = y1 * v + y0 * (1. - v) - offset;
    T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));
    T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));
    if (!aligned) {  // for backward-compatibility only
      roi_width = max(roi_width, (T)1.);
      roi_height = max(roi_height, (T)1.);
    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
    T *offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels + c) * height * width;
    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const T *offset_top_diff = top_diff + top_offset;
    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
    // We do average (integral) pooling inside a bin
    // NOTE(review): unlike the forward kernel, `count` is not clamped to
    // >= 1; harmless here because a zero-sized grid also skips the loops.
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1
    {
      const T y = y_center - (T)0.5 * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h /
                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = x_center - (T)0.5 * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w /
                        static_cast<T>(roi_bin_grid_w);
        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high, index);
        // Split the averaged bin gradient across the four bilinear corners.
        T g1 = top_diff_this_bin * w1 / count;
        T g2 = top_diff_this_bin * w2 / count;
        T g3 = top_diff_this_bin * w3 / count;
        T g4 = top_diff_this_bin * w4 / count;
        // Negative corner indices mean the sample fell outside the feature
        // map; skip the scatter in that case.
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(offset_bottom_diff + y_low * width + x_low,
                    static_cast<T>(g1));
          atomicAdd(offset_bottom_diff + y_low * width + x_high,
                    static_cast<T>(g2));
          atomicAdd(offset_bottom_diff + y_high * width + x_low,
                    static_cast<T>(g3));
          atomicAdd(offset_bottom_diff + y_high * width + x_high,
                    static_cast<T>(g4));
        }  // if
      }    // ix
    }      // iy
  }  // CUDA_1D_KERNEL_LOOP
}  // BezierAlignBackward
#endif // BEZIER_ALIGN_CUDA_KERNEL_CUH
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#ifndef BOX_IOU_QUADRI_CUDA_CUH
#define BOX_IOU_QUADRI_CUDA_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#include "box_iou_rotated_utils.hpp"
// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;
// Ceiling division for positive ints: smallest q such that q * y >= x.
// Used to size CUDA grids from element counts and block dimensions.
inline int divideUP(const int x, const int y) {
  const int q = (x + y - 1) / y;
  return q;
}
// Computes IoU between pairs of quadrilateral boxes (8 floats each: four
// (x, y) vertices). aligned == true pairs box i of both lists (n_boxes1
// outputs); otherwise the full n_boxes1 x n_boxes2 grid is computed, one
// output element per loop iteration. `mode_flag` selects the overlap
// criterion passed through to single_box_iou_quadri.
template <typename T>
__global__ void box_iou_quadri_cuda_kernel(
    const int n_boxes1, const int n_boxes2, const T* dev_boxes1,
    const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {
  if (aligned) {
    CUDA_1D_KERNEL_LOOP(index, n_boxes1) {
      // Stage both quads in registers/local memory before the IoU call.
      float quad1[8];
      float quad2[8];
      const int src = index * 8;
      for (int j = 0; j < 8; j++) {
        quad1[j] = dev_boxes1[src + j];
        quad2[j] = dev_boxes2[src + j];
      }
      dev_ious[index] =
          single_box_iou_quadri<T>(quad1, quad2, mode_flag);
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {
      const int src1 = (index / n_boxes2) * 8;
      const int src2 = (index % n_boxes2) * 8;
      float quad1[8];
      float quad2[8];
      for (int j = 0; j < 8; j++) {
        quad1[j] = dev_boxes1[src1 + j];
        quad2[j] = dev_boxes2[src2 + j];
      }
      dev_ious[index] =
          single_box_iou_quadri<T>(quad1, quad2, mode_flag);
    }
  }
}
#endif
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#include "pytorch_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
#endif #endif
#ifdef MMCV_WITH_HIP #ifdef HIP_DIFF
#define WARP_SIZE 64 #define WARP_SIZE 64
#else #else
#define WARP_SIZE 32 #define WARP_SIZE 32
...@@ -29,22 +29,22 @@ __device__ inline int Loc2Index(const int n, const int c, const int h, ...@@ -29,22 +29,22 @@ __device__ inline int Loc2Index(const int n, const int c, const int h,
int index = w + (h + (c + n * channel_num) * height) * width; int index = w + (h + (c + n * channel_num) * height) * width;
return index; return index;
} }
#ifndef MMCV_WITH_HIP #ifndef HIP_DIFF
/* TODO: move this to a common place */ /* TODO: move this to a common place */
template <typename scalar_t> template <typename scalar_t>
__device__ inline scalar_t min(scalar_t a, scalar_t b) { __device__ inline scalar_t mmcv_min(scalar_t a, scalar_t b) {
return a < b ? a : b; return a < b ? a : b;
} }
template <typename scalar_t> template <typename scalar_t>
__device__ inline scalar_t max(scalar_t a, scalar_t b) { __device__ inline scalar_t mmcv_max(scalar_t a, scalar_t b) {
return a > b ? a : b; return a > b ? a : b;
} }
#endif #endif
template <typename scalar_t> template <typename scalar_t>
__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { __device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP #ifdef HIP_DIFF
val += __shfl_down(val, offset); val += __shfl_down(val, offset);
#else #else
val += __shfl_down_sync(FULL_MASK, val, offset); val += __shfl_down_sync(FULL_MASK, val, offset);
...@@ -55,11 +55,11 @@ __device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { ...@@ -55,11 +55,11 @@ __device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
template <> template <>
__device__ __forceinline__ phalf warpReduceSum(phalf val) { __device__ __forceinline__ phalf warpReduceSum(phalf val) {
for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP #ifdef HIP_DIFF
__PHALF(val) += __shfl_down(val, offset); __PHALF(val) += __shfl_down(FULL_MASK, val, offset);
#else #else
__PHALF(val) += __PHALF(val) +=
__shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset); __shfl_down_sync(FULL_MASK, static_cast<__half>(__PHALF(val)), offset);
#endif #endif
return val; return val;
} }
...@@ -316,7 +316,7 @@ __global__ void CARAFEBackward_Mask(const int num_kernels, ...@@ -316,7 +316,7 @@ __global__ void CARAFEBackward_Mask(const int num_kernels,
output_val += top_diff[top_id] * bottom_data[bottom_id]; output_val += top_diff[top_id] * bottom_data[bottom_id];
} }
} }
#ifdef MMCV_WITH_HIP #ifdef HIP_DIFF
__syncthreads(); __syncthreads();
#else #else
__syncwarp(); __syncwarp();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment