Commit fdeee889 authored by limm
Browse files

release v1.6.1 of mmcv

parent df465820
# Copyright (c) OpenMMLab. All rights reserved.
from .active_rotated_filter import active_rotated_filter
from .assign_score_withk import assign_score_withk
from .ball_query import ball_query
from .bbox import bbox_overlaps
......@@ -6,7 +7,9 @@ from .border_align import BorderAlign, border_align
from .box_iou_rotated import box_iou_rotated
from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive
from .cc_attention import CrissCrossAttention
from .chamfer_distance import chamfer_distance
from .contour_expand import contour_expand
from .convex_iou import convex_giou, convex_iou
from .corner_pool import CornerPool
from .correlation import Correlation
from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d
......@@ -16,6 +19,7 @@ from .deprecated_wrappers import Conv2d_deprecated as Conv2d
from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d
from .deprecated_wrappers import Linear_deprecated as Linear
from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d
from .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d
from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss,
sigmoid_focal_loss, softmax_focal_loss)
from .furthest_point_sample import (furthest_point_sample,
......@@ -25,9 +29,11 @@ from .gather_points import gather_points
from .group_points import GroupAll, QueryAndGroup, grouping_operation
from .info import (get_compiler_version, get_compiling_cuda_version,
get_onnxruntime_op_path)
from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev
from .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d,
nms3d_normal, nms_bev, nms_normal_bev)
from .knn import knn
from .masked_conv import MaskedConv2d, masked_conv2d
from .min_area_polygons import min_area_polygons
from .modulated_deform_conv import (ModulatedDeformConv2d,
ModulatedDeformConv2dPack,
modulated_deform_conv2d)
......@@ -38,15 +44,25 @@ from .point_sample import (SimpleRoIAlign, point_sample,
rel_roi_point_to_rel_img_point)
from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
points_in_boxes_part)
from .points_in_polygons import points_in_polygons
from .points_sampler import PointsSampler
from .prroi_pool import PrRoIPool, prroi_pool
from .psa_mask import PSAMask
from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated
from .roi_align import RoIAlign, roi_align
from .roi_align_rotated import RoIAlignRotated, roi_align_rotated
from .roi_pool import RoIPool, roi_pool
from .roiaware_pool3d import RoIAwarePool3d
from .roipoint_pool3d import RoIPointPool3d
from .rotated_feature_align import rotated_feature_align
from .saconv import SAConv2d
from .scatter_points import DynamicScatter, dynamic_scatter
from .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
SparseConvTranspose3d, SparseInverseConv2d,
SparseInverseConv3d, SubMConv2d, SubMConv3d)
from .sparse_modules import SparseModule, SparseSequential
from .sparse_pool import SparseMaxPool2d, SparseMaxPool3d
from .sparse_structure import SparseConvTensor, scatter_nd
from .sync_bn import SyncBatchNorm
from .three_interpolate import three_interpolate
from .three_nn import three_nn
......@@ -70,12 +86,21 @@ __all__ = [
'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk',
'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query',
'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu',
'rotated_feature_align', 'RiRoIAlignRotated', 'riroi_align_rotated',
'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup',
'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn',
'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign',
'border_align', 'gather_points', 'furthest_point_sample',
'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation',
'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization',
'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d',
'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all'
'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev',
'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization',
'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d',
'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d',
'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d',
'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d',
'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part',
'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons',
'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou',
'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance',
'PrRoIPool', 'prroi_pool'
]
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext',
['active_rotated_filter_forward', 'active_rotated_filter_backward'])
class ActiveRotatedFilterFunction(Function):
    """Encoding the orientation information and generating orientation-
    sensitive features.

    The details are described in the paper `Align Deep Features for Oriented
    Object Detection <https://arxiv.org/abs/2008.09397>`_.
    """

    @staticmethod
    def forward(ctx, input: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """
        Args:
            input (torch.Tensor): Input features with shape
                [num_output_planes, num_input_planes, num_orientations, H, W].
            indices (torch.Tensor): Indices with shape
                [num_orientations, H, W, num_rotations].

        Returns:
            torch.Tensor: Refined features with shape [num_output_planes *
            num_rotations, num_input_planes * num_orientations, H, W].
        """
        # Both tensors are needed again in backward.
        ctx.save_for_backward(input, indices)
        out_planes, in_planes, _, _, _ = input.size()
        num_orientations, height, width, num_rotations = indices.size()
        # The CUDA/CPU extension fills this pre-allocated buffer in place.
        output = input.new_zeros(
            (out_planes * num_rotations, in_planes * num_orientations, height,
             width))
        ext_module.active_rotated_filter_forward(input, indices, output)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]:
        """
        Args:
            grad_out (torch.Tensor): The gradient of output features
                with shape [num_output_planes * num_rotations,
                num_input_planes * num_orientations, H, W].

        Returns:
            torch.Tensor: The gradient of input features with shape
            [num_output_planes, num_input_planes, num_orientations, H, W].
        """
        input, indices = ctx.saved_tensors
        # Gradient buffer matches the original input layout; the extension
        # scatters grad_out back through the rotation indices.
        grad_in = torch.zeros_like(input)
        ext_module.active_rotated_filter_backward(grad_out, indices, grad_in)
        # ``indices`` is not differentiable, hence the trailing None.
        return grad_in, None


active_rotated_filter = ActiveRotatedFilterFunction.apply
from typing import Tuple
import torch
from torch.autograd import Function
from ..utils import ext_loader
......@@ -27,11 +30,11 @@ class AssignScoreWithK(Function):
@staticmethod
def forward(ctx,
scores,
point_features,
center_features,
knn_idx,
aggregate='sum'):
scores: torch.Tensor,
point_features: torch.Tensor,
center_features: torch.Tensor,
knn_idx: torch.Tensor,
aggregate: str = 'sum') -> torch.Tensor:
"""
Args:
scores (torch.Tensor): (B, npoint, K, M), predicted scores to
......@@ -78,15 +81,20 @@ class AssignScoreWithK(Function):
return output
@staticmethod
def backward(ctx, grad_out):
def backward(
ctx, grad_out: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None]:
"""
Args:
grad_out (torch.Tensor): (B, out_dim, npoint, K)
Returns:
grad_scores (torch.Tensor): (B, npoint, K, M)
grad_point_features (torch.Tensor): (B, N, M, out_dim)
grad_center_features (torch.Tensor): (B, N, M, out_dim)
tuple[torch.Tensor]: A tuple contains five elements. The first one
is the gradient of ``scores`` whose shape is (B, npoint, K, M). The
second is the gradient of ``point_features`` whose shape is
(B, N, M, out_dim). The third is the gradient of
``center_features`` with the shape of (B, N, M, out_dim). The last
two are ``None``.
"""
_, point_features, center_features, scores, knn_idx = ctx.saved_tensors
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from torch.autograd import Function
......@@ -18,12 +20,13 @@ class BallQuery(Function):
min_radius (float): minimum radius of the balls.
max_radius (float): maximum radius of the balls.
sample_num (int): maximum number of features in the balls.
xyz (Tensor): (B, N, 3) xyz coordinates of the features.
center_xyz (Tensor): (B, npoint, 3) centers of the ball query.
xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features.
center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball
query.
Returns:
Tensor: (B, npoint, nsample) tensor with the indices of
the features that form the query balls.
torch.Tensor: (B, npoint, nsample) tensor with the indices of the
features that form the query balls.
"""
assert center_xyz.is_contiguous()
assert xyz.is_contiguous()
......@@ -48,7 +51,7 @@ class BallQuery(Function):
return idx
@staticmethod
def backward(ctx, a=None):
def backward(ctx, a=None) -> Tuple[None, None, None, None]:
return None, None, None, None
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])
def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
def _bbox_overlaps_cpu(bboxes1: torch.Tensor,
bboxes2: torch.Tensor,
mode: str = 'iou',
aligned: bool = False,
offset: int = 0) -> torch.Tensor:
assert mode in ['iou', 'iof']
if aligned:
lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2]
rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2]
wh = (rb - lt + offset).clamp(min=0) # [rows, 2]
overlap = wh[:, 0] * wh[:, 1]
area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
bboxes1[:, 3] - bboxes1[:, 1] + offset)
if mode == 'iou':
area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
bboxes2[:, 3] - bboxes2[:, 1] + offset)
ious = overlap / (area1 + area2 - overlap)
else:
ious = overlap / area1
else:
lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2]
rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2]
wh = (rb - lt + offset).clamp(min=0) # [rows, cols, 2]
overlap = wh[:, :, 0] * wh[:, :, 1]
area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
bboxes1[:, 3] - bboxes1[:, 1] + offset)
if mode == 'iou':
area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
bboxes2[:, 3] - bboxes2[:, 1] + offset)
ious = overlap / (area1[:, None] + area2 - overlap)
else:
ious = overlap / (area1[:, None])
return ious
def bbox_overlaps(bboxes1: torch.Tensor,
bboxes2: torch.Tensor,
mode: str = 'iou',
aligned: bool = False,
offset: int = 0) -> torch.Tensor:
"""Calculate overlap between two set of bboxes.
If ``aligned`` is ``False``, then calculate the ious between each bbox
......@@ -12,14 +59,16 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
bboxes1 and bboxes2.
Args:
bboxes1 (Tensor): shape (m, 4) in <x1, y1, x2, y2> format or empty.
bboxes2 (Tensor): shape (n, 4) in <x1, y1, x2, y2> format or empty.
If aligned is ``True``, then m and n must be equal.
bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or
empty.
bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or
empty. If aligned is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union) or iof (intersection over
foreground).
Returns:
ious(Tensor): shape (m, n) if aligned == False else shape (m, 1)
torch.Tensor: Return the ious betweens boxes. If ``aligned`` is
``False``, the shape of ious is (m, n) else (m, 1).
Example:
>>> bboxes1 = torch.FloatTensor([
......@@ -63,10 +112,19 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
if rows * cols == 0:
return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols)
if aligned:
ious = bboxes1.new_zeros(rows)
if bboxes1.device.type == 'cpu':
return _bbox_overlaps_cpu(
bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset)
else:
ious = bboxes1.new_zeros((rows, cols))
ext_module.bbox_overlaps(
bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)
return ious
if aligned:
ious = bboxes1.new_zeros(rows)
else:
ious = bboxes1.new_zeros((rows, cols))
ext_module.bbox_overlaps(
bboxes1,
bboxes2,
ious,
mode=mode_flag,
aligned=aligned,
offset=offset)
return ious
......@@ -2,6 +2,8 @@
# modified from
# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py
from typing import Tuple
import torch
import torch.nn as nn
from torch.autograd import Function
......@@ -21,7 +23,8 @@ class BorderAlignFunction(Function):
'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)
@staticmethod
def forward(ctx, input, boxes, pool_size):
def forward(ctx, input: torch.Tensor, boxes: torch.Tensor,
pool_size: int) -> torch.Tensor:
ctx.pool_size = pool_size
ctx.input_shape = input.size()
......@@ -45,7 +48,8 @@ class BorderAlignFunction(Function):
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
def backward(ctx,
grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
boxes, argmax_idx = ctx.saved_tensors
grad_input = grad_output.new_zeros(ctx.input_shape)
# complex head architecture may cause grad_output uncontiguous
......@@ -72,24 +76,25 @@ class BorderAlign(nn.Module):
For each border line (e.g. top, left, bottom or right) of each box,
border_align does the following:
1. uniformly samples `pool_size`+1 positions on this line, involving \
the start and end points.
2. the corresponding features on these points are computed by \
bilinear interpolation.
3. max pooling over all the `pool_size`+1 positions are used for \
computing pooled feature.
1. uniformly samples ``pool_size`` +1 positions on this line, involving
the start and end points.
2. the corresponding features on these points are computed by bilinear
interpolation.
3. max pooling over all the ``pool_size`` +1 positions are used for
computing pooled feature.
Args:
pool_size (int): number of positions sampled over the boxes' borders
(e.g. top, bottom, left, right).
"""
def __init__(self, pool_size):
super(BorderAlign, self).__init__()
def __init__(self, pool_size: int):
super().__init__()
self.pool_size = pool_size
def forward(self, input, boxes):
def forward(self, input: torch.Tensor,
boxes: torch.Tensor) -> torch.Tensor:
"""
Args:
input: Features with shape [N,4C,H,W]. Channels ranged in [0,C),
......@@ -98,8 +103,8 @@ class BorderAlign(nn.Module):
boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).
Returns:
Tensor: Pooled features with shape [N,C,H*W,4]. The order is
(top,left,bottom,right) for the last dimension.
torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is
(top,left,bottom,right) for the last dimension.
"""
return border_align(input, boxes, self.pool_size)
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])
def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
def box_iou_rotated(bboxes1: torch.Tensor,
bboxes2: torch.Tensor,
mode: str = 'iou',
aligned: bool = False,
clockwise: bool = True) -> torch.Tensor:
"""Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in
......@@ -14,18 +20,110 @@ def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
bboxes1 and bboxes2.
Arguments:
boxes1 (Tensor): rotated bboxes 1. \
It has shape (N, 5), indicating (x, y, w, h, theta) for each row.
Note that theta is in radian.
boxes2 (Tensor): rotated bboxes 2. \
It has shape (M, 5), indicating (x, y, w, h, theta) for each row.
Note that theta is in radian.
.. note::
The operator assumes:
1) The positive direction along x axis is left -> right.
2) The positive direction along y axis is top -> down.
3) The w border is in parallel with x axis when angle = 0.
However, there are 2 opposite definitions of the positive angular
direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports
both definitions and uses CW by default.
Please set ``clockwise=False`` if you are using the CCW definition.
The coordinate system when ``clockwise`` is ``True`` (default)
.. code-block:: none
0-------------------> x (0 rad)
| A-------------B
| | |
| | box h
| | angle=0 |
| D------w------C
v
y (pi/2 rad)
In such coordination system the rotation matrix is
.. math::
\\begin{pmatrix}
\\cos\\alpha & -\\sin\\alpha \\\\
\\sin\\alpha & \\cos\\alpha
\\end{pmatrix}
The coordinates of the corner point A can be calculated as:
.. math::
P_A=
\\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
=
\\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
\\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\
\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
\\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
=
\\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha
\\\\
y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
The coordinate system when ``clockwise`` is ``False``
.. code-block:: none
0-------------------> x (0 rad)
| A-------------B
| | |
| | box h
| | angle=0 |
| D------w------C
v
y (-pi/2 rad)
In such coordination system the rotation matrix is
.. math::
\\begin{pmatrix}
\\cos\\alpha & \\sin\\alpha \\\\
-\\sin\\alpha & \\cos\\alpha
\\end{pmatrix}
The coordinates of the corner point A can be calculated as:
.. math::
P_A=
\\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
=
\\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
\\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\
-\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
\\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
=
\\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha
\\\\
y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
Args:
boxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5),
indicating (x, y, w, h, theta) for each row. Note that theta is in
radian.
boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5),
indicating (x, y, w, h, theta) for each row. Note that theta is in
radian.
mode (str): "iou" (intersection over union) or iof (intersection over
foreground).
clockwise (bool): flag indicating whether the positive angular
orientation is clockwise. default True.
`New in version 1.4.3.`
Returns:
ious(Tensor): shape (N, M) if aligned == False else shape (N,)
torch.Tensor: Return the ious betweens boxes. If ``aligned`` is
``False``, the shape of ious is (N, M) else (N,).
"""
assert mode in ['iou', 'iof']
mode_dict = {'iou': 0, 'iof': 1}
......@@ -35,7 +133,12 @@ def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
if aligned:
ious = bboxes1.new_zeros(rows)
else:
ious = bboxes1.new_zeros((rows * cols))
ious = bboxes1.new_zeros(rows * cols)
if not clockwise:
flip_mat = bboxes1.new_ones(bboxes1.shape[-1])
flip_mat[-1] = -1
bboxes1 = bboxes1 * flip_mat
bboxes2 = bboxes2 * flip_mat
bboxes1 = bboxes1.contiguous()
bboxes2 = bboxes2.contiguous()
ext_module.box_iou_rotated(
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.autograd import Function
from torch.nn.modules.module import Module
......@@ -17,7 +20,8 @@ ext_module = ext_loader.load_ext('_ext', [
class CARAFENaiveFunction(Function):
@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
return g.op(
'mmcv::MMCVCARAFENaive',
features,
......@@ -27,7 +31,8 @@ class CARAFENaiveFunction(Function):
scale_factor_f=scale_factor)
@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
......@@ -50,12 +55,15 @@ class CARAFENaiveFunction(Function):
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad:
if features.requires_grad or masks.requires_grad or \
torch.__version__ == 'parrots':
ctx.save_for_backward(features, masks)
return output
@staticmethod
def backward(ctx, grad_output):
def backward(
ctx,
grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
assert grad_output.is_cuda
features, masks = ctx.saved_tensors
......@@ -83,8 +91,8 @@ carafe_naive = CARAFENaiveFunction.apply
class CARAFENaive(Module):
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFENaive, self).__init__()
def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
super().__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
......@@ -92,7 +100,7 @@ class CARAFENaive(Module):
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features, masks):
def forward(self, features: Tensor, masks: Tensor) -> Tensor:
return carafe_naive(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
......@@ -100,7 +108,8 @@ class CARAFENaive(Module):
class CARAFEFunction(Function):
@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
return g.op(
'mmcv::MMCVCARAFE',
features,
......@@ -110,7 +119,8 @@ class CARAFEFunction(Function):
scale_factor_f=scale_factor)
@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
group_size: int, scale_factor: int) -> Tensor:
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
......@@ -139,12 +149,15 @@ class CARAFEFunction(Function):
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad:
if features.requires_grad or masks.requires_grad or \
torch.__version__ == 'parrots':
ctx.save_for_backward(features, masks, rfeatures)
return output
@staticmethod
def backward(ctx, grad_output):
def backward(
ctx,
grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
assert grad_output.is_cuda
features, masks, rfeatures = ctx.saved_tensors
......@@ -180,7 +193,8 @@ carafe = CARAFEFunction.apply
class CARAFE(Module):
""" CARAFE: Content-Aware ReAssembly of FEatures
Please refer to https://arxiv.org/abs/1905.02188 for more details.
Please refer to `CARAFE: Content-Aware ReAssembly of FEatures
<https://arxiv.org/abs/1905.02188>`_ for more details.
Args:
kernel_size (int): reassemble kernel size
......@@ -191,8 +205,8 @@ class CARAFE(Module):
upsampled feature map
"""
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFE, self).__init__()
def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
super().__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
......@@ -200,7 +214,7 @@ class CARAFE(Module):
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features, masks):
def forward(self, features: Tensor, masks: Tensor) -> Tensor:
return carafe(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
......@@ -211,8 +225,8 @@ class CARAFEPack(nn.Module):
compressor 2) content encoder 3) CARAFE op.
Official implementation of ICCV 2019 paper
CARAFE: Content-Aware ReAssembly of FEatures
Please refer to https://arxiv.org/abs/1905.02188 for more details.
`CARAFE: Content-Aware ReAssembly of FEatures
<https://arxiv.org/abs/1905.02188>`_.
Args:
channels (int): input feature channels
......@@ -228,14 +242,14 @@ class CARAFEPack(nn.Module):
"""
def __init__(self,
channels,
scale_factor,
up_kernel=5,
up_group=1,
encoder_kernel=3,
encoder_dilation=1,
compressed_channels=64):
super(CARAFEPack, self).__init__()
channels: int,
scale_factor: int,
up_kernel: int = 5,
up_group: int = 1,
encoder_kernel: int = 3,
encoder_dilation: int = 1,
compressed_channels: int = 64):
super().__init__()
self.channels = channels
self.scale_factor = scale_factor
self.up_kernel = up_kernel
......@@ -261,7 +275,7 @@ class CARAFEPack(nn.Module):
xavier_init(m, distribution='uniform')
normal_init(self.content_encoder, std=0.001)
def kernel_normalizer(self, mask):
def kernel_normalizer(self, mask: Tensor) -> Tensor:
mask = F.pixel_shuffle(mask, self.scale_factor)
n, mask_c, h, w = mask.size()
# use float division explicitly,
......@@ -274,11 +288,11 @@ class CARAFEPack(nn.Module):
return mask
def feature_reassemble(self, x, mask):
def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor:
x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
return x
def forward(self, x):
def forward(self, x: Tensor) -> Tensor:
compressed_x = self.channel_compressor(x)
mask = self.content_encoder(compressed_x)
mask = self.kernel_normalizer(mask)
......
......@@ -6,7 +6,7 @@ import torch.nn.functional as F
from mmcv.cnn import PLUGIN_LAYERS, Scale
def NEG_INF_DIAG(n, device):
def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor:
"""Returns a diagonal matrix of size [n, n].
The diagonal are all "-inf". This is for avoiding calculating the
......@@ -41,7 +41,7 @@ class CrissCrossAttention(nn.Module):
in_channels (int): Channels of the input feature map.
"""
def __init__(self, in_channels):
def __init__(self, in_channels: int) -> None:
super().__init__()
self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
......@@ -49,14 +49,15 @@ class CrissCrossAttention(nn.Module):
self.gamma = Scale(0.)
self.in_channels = in_channels
def forward(self, x):
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""forward function of Criss-Cross Attention.
Args:
x (Tensor): Input feature. \
shape (batch_size, in_channels, height, width)
x (torch.Tensor): Input feature with the shape of
(batch_size, in_channels, height, width).
Returns:
Tensor: Output of the layer, with shape of \
torch.Tensor: Output of the layer, with the shape of
(batch_size, in_channels, height, width)
"""
B, C, H, W = x.size()
......@@ -77,7 +78,7 @@ class CrissCrossAttention(nn.Module):
return out
def __repr__(self):
def __repr__(self) -> str:
s = self.__class__.__name__
s += f'(in_channels={self.in_channels})'
return s
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Sequence, Tuple
import torch
from torch import Tensor
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['chamfer_distance_forward', 'chamfer_distance_backward'])
class ChamferDistanceFunction(Function):
    """This is an implementation of the 2D Chamfer Distance.

    It has been used in the paper `Oriented RepPoints for Aerial Object
    Detection (CVPR 2022) <https://arxiv.org/abs/2105.11111>`_.
    """

    @staticmethod
    def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]:
        """
        Args:
            xyz1 (Tensor): Point set with shape (B, N, 2).
            xyz2 (Tensor): Point set with shape (B, M, 2).

        Returns:
            Sequence[Tensor]:

                - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with
                    shape (B, N).
                - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with
                    shape (B, M).
                - idx1 (Tensor): Index of the nearest point in xyz2 for each
                    point in xyz1, with shape (B, N); used to compute the
                    gradient.
                - idx2 (Tensor): Index of the nearest point in xyz1 for each
                    point in xyz2, with shape (B, M); used to compute the
                    gradient.
        """
        batch_size, n, _ = xyz1.size()
        _, m, _ = xyz2.size()
        device = xyz1.device
        # The extension expects contiguous memory layouts.
        xyz1 = xyz1.contiguous()
        xyz2 = xyz2.contiguous()
        # Pre-allocated output buffers the extension fills in place.
        dist1 = torch.zeros(batch_size, n).to(device)
        dist2 = torch.zeros(batch_size, m).to(device)
        idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device)
        idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device)
        ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1,
                                            idx2)
        # Saved for use in backward (nearest-neighbour indices included).
        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
        return dist1, dist2, idx1, idx2

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_dist1: Tensor, grad_dist2: Tensor,
                 grad_idx1: Tensor,
                 grad_idx2: Tensor) -> Tuple[Tensor, Tensor]:
        """
        Args:
            grad_dist1 (Tensor): Gradient of chamfer distance
                (xyz1 to xyz2) with shape (B, N).
            grad_dist2 (Tensor): Gradient of chamfer distance
                (xyz2 to xyz1) with shape (B, M).
            grad_idx1 (Tensor): Unused; the index outputs are not
                differentiable.
            grad_idx2 (Tensor): Unused; the index outputs are not
                differentiable.

        Returns:
            Tuple[Tensor, Tensor]:

                - grad_xyz1 (Tensor): Gradient of the point set with shape
                    (B, N, 2).
                - grad_xyz2 (Tensor): Gradient of the point set with shape
                    (B, M, 2).
        """
        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
        device = grad_dist1.device
        grad_dist1 = grad_dist1.contiguous()
        grad_dist2 = grad_dist2.contiguous()
        # Zero-initialised gradient buffers the extension fills in place.
        grad_xyz1 = torch.zeros(xyz1.size()).to(device)
        grad_xyz2 = torch.zeros(xyz2.size()).to(device)
        ext_module.chamfer_distance_backward(xyz1, xyz2, grad_xyz1, grad_xyz2,
                                             grad_dist1, grad_dist2, idx1,
                                             idx2)
        return grad_xyz1, grad_xyz2


chamfer_distance = ChamferDistanceFunction.apply
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Union
import numpy as np
import torch
......@@ -7,21 +9,22 @@ from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['contour_expand'])
def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
kernel_num):
def contour_expand(kernel_mask: Union[np.array, torch.Tensor],
internal_kernel_label: Union[np.array, torch.Tensor],
min_kernel_area: int, kernel_num: int) -> list:
"""Expand kernel contours so that foreground pixels are assigned into
instances.
Arguments:
kernel_mask (np.array or Tensor): The instance kernel mask with
Args:
kernel_mask (np.array or torch.Tensor): The instance kernel mask with
size hxw.
internal_kernel_label (np.array or Tensor): The instance internal
internal_kernel_label (np.array or torch.Tensor): The instance internal
kernel label with size hxw.
min_kernel_area (int): The minimum kernel area.
kernel_num (int): The instance kernel number.
Returns:
label (list): The instance index map with size hxw.
list: The instance index map with size hxw.
"""
assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))
assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))
......@@ -42,7 +45,7 @@ def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
internal_kernel_label,
min_kernel_area=min_kernel_area,
kernel_num=kernel_num)
label = label.tolist()
label = label.tolist() # type: ignore
else:
label = ext_module.contour_expand(kernel_mask, internal_kernel_label,
min_kernel_area, kernel_num)
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou'])
def convex_giou(pointsets: torch.Tensor,
                polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return generalized intersection-over-union (Jaccard index) between point
    sets and polygons.

    Args:
        pointsets (torch.Tensor): It has shape (N, 18),
            indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
        polygons (torch.Tensor): It has shape (N, 8),
            indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The first element is the gious
        between point sets and polygons with the shape (N,). The second
        element is the gradient of point sets with the shape (N, 18).
    """
    # The extension writes 19 values per row: 18 gradient entries followed
    # by the GIoU value itself.
    output = pointsets.new_zeros((pointsets.size(0), 19))
    ext_module.convex_giou(pointsets, polygons, output)
    # Use a distinct local name; the original shadowed this function's own
    # name with the result tensor.
    gious = output[:, -1]
    points_grad = output[:, 0:-1]
    return gious, points_grad
def convex_iou(pointsets: torch.Tensor,
               polygons: torch.Tensor) -> torch.Tensor:
    """Return intersection-over-union (Jaccard index) between point sets and
    polygons.

    Args:
        pointsets (torch.Tensor): It has shape (N, 18),
            indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
        polygons (torch.Tensor): It has shape (K, 8),
            indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.

    Returns:
        torch.Tensor: Return the ious between point sets and polygons with the
        shape (N, K).
    """
    num_pointsets = pointsets.size(0)
    num_polygons = polygons.size(0)
    # Preallocate the (N, K) result; the extension fills it in place.
    ious = pointsets.new_zeros((num_pointsets, num_polygons))
    ext_module.convex_iou(pointsets, polygons, ious)
    return ious
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import nn
from torch import Tensor, nn
from torch.autograd import Function
from ..utils import ext_loader
_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
ext_module = ext_loader.load_ext('_ext', [
'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward',
'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward',
'right_pool_forward', 'right_pool_backward'
])
_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor:
size = x.size(dim)
output = x.clone()
ind = 1
while ind < size:
if flip:
cur_start = 0
cur_len = size - ind
next_start = ind
next_len = size - ind
else:
cur_start = ind
cur_len = size - ind
next_start = 0
next_len = size - ind
# max_temp should be cloned for backward computation
max_temp = output.narrow(dim, cur_start, cur_len).clone()
cur_temp = output.narrow(dim, cur_start, cur_len)
next_temp = output.narrow(dim, next_start, next_len)
cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp)
ind = ind << 1
return output
class TopPoolFunction(Function):
    """Corner pooling that scans the feature map from bottom to top.

    The visible block interleaved two revisions from a diff (duplicate
    ``symbolic``/``forward`` definitions plus a stale ``ext_module``-based
    forward/backward pair); only the current revision is kept here.
    """

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        # Exported to ONNX as the custom op ``mmcv::MMCVCornerPool``.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        # Pure-PyTorch pooling along dim 2 with flip=True; gradients flow
        # through torch.where inside _corner_pool, so no custom backward
        # is required.
        return _corner_pool(input, 2, True)
class BottomPoolFunction(Function):
    """Corner pooling that scans the feature map from top to bottom.

    The visible block interleaved two revisions from a diff (duplicate
    ``symbolic``/``forward`` definitions plus a stale ``ext_module``-based
    forward/backward pair); only the current revision is kept here.
    """

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        # Exported to ONNX as the custom op ``mmcv::MMCVCornerPool``.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        # Pure-PyTorch pooling along dim 2 with flip=False; gradients flow
        # through torch.where inside _corner_pool, so no custom backward
        # is required.
        return _corner_pool(input, 2, False)
class LeftPoolFunction(Function):
    """Corner pooling that scans the feature map from right to left.

    The visible block interleaved two revisions from a diff (duplicate
    ``symbolic``/``forward`` definitions plus a stale ``ext_module``-based
    forward/backward pair); only the current revision is kept here.
    """

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        # Exported to ONNX as the custom op ``mmcv::MMCVCornerPool``.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        # Pure-PyTorch pooling along dim 3 with flip=True; gradients flow
        # through torch.where inside _corner_pool, so no custom backward
        # is required.
        return _corner_pool(input, 3, True)
class RightPoolFunction(Function):
    """Corner pooling that scans the feature map from left to right.

    The visible block interleaved two revisions from a diff (duplicate
    ``symbolic``/``forward`` definitions plus a stale ``ext_module``-based
    forward/backward pair); only the current revision is kept here.
    """

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        # Exported to ONNX as the custom op ``mmcv::MMCVCornerPool``.
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        # Pure-PyTorch pooling along dim 3 with flip=False; gradients flow
        # through torch.where inside _corner_pool, so no custom backward
        # is required.
        return _corner_pool(input, 3, False)
class CornerPool(nn.Module):
......@@ -104,11 +93,13 @@ class CornerPool(nn.Module):
Corner Pooling is a new type of pooling layer that helps a
convolutional network better localize corners of bounding boxes.
Please refer to https://arxiv.org/abs/1808.01244 for more details.
Please refer to `CornerNet: Detecting Objects as Paired Keypoints
<https://arxiv.org/abs/1808.01244>`_ for more details.
Code is modified from https://github.com/princeton-vl/CornerNet-Lite.
Args:
mode(str): Pooling orientation for the pooling layer
mode (str): Pooling orientation for the pooling layer
- 'bottom': Bottom Pooling
- 'left': Left Pooling
......@@ -133,13 +124,13 @@ class CornerPool(nn.Module):
'top': (2, True),
}
def __init__(self, mode):
super(CornerPool, self).__init__()
def __init__(self, mode: str):
super().__init__()
assert mode in self.pool_functions
self.mode = mode
self.corner_pool = self.pool_functions[mode]
self.corner_pool: Function = self.pool_functions[mode]
def forward(self, x):
def forward(self, x: Tensor) -> Tensor:
if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
if torch.onnx.is_in_onnx_export():
assert torch.__version__ >= '1.7.0', \
......@@ -158,4 +149,8 @@ class CornerPool(nn.Module):
pool_tensor = pool_tensor.flip(dim)
return pool_tensor
else:
return self.corner_pool.apply(x)
if torch.onnx.is_in_onnx_export():
return self.corner_pool.apply(x)
else:
dim, flip = self.cummax_dim_flip[self.mode]
return _corner_pool(x, dim, flip)
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
import torch
from torch import Tensor, nn
from torch.autograd import Function
......@@ -15,14 +17,14 @@ class CorrelationFunction(Function):
@staticmethod
def forward(ctx,
input1,
input2,
kernel_size=1,
max_displacement=1,
stride=1,
padding=1,
dilation=1,
dilation_patch=1):
input1: Tensor,
input2: Tensor,
kernel_size: int = 1,
max_displacement: int = 1,
stride: int = 1,
padding: int = 1,
dilation: int = 1,
dilation_patch: int = 1) -> Tensor:
ctx.save_for_backward(input1, input2)
......@@ -60,7 +62,9 @@ class CorrelationFunction(Function):
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
def backward(
ctx, grad_output: Tensor
) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]:
input1, input2 = ctx.saved_tensors
kH, kW = ctx.kernel_size
......
......@@ -13,11 +13,19 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│ ├── pytorch_cpp_helper.hpp
│ ├── pytorch_cuda_helper.hpp
│ ├── pytorch_device_registry.hpp
│   └── cuda
│   ├── common_cuda_helper.hpp
│   ├── parrots_cudawarpfunction.cuh
│   ├── ...
│   └── ops_cuda_kernel.cuh
│   ├── cuda
│   │ ├── common_cuda_helper.hpp
│   │ ├── parrots_cudawarpfunction.cuh
│   │ ├── ...
│   │ └── ops_cuda_kernel.cuh
|   ├── mps
│   │ ├── MPSLibrary.h
│   │ ├── ...
│   │ └── MPSUtils.h
|   ├── mlu
│   │ └── ...
|   └── utils
│   │ └── ...
├── onnxruntime
│   ├── onnxruntime_register.h
│   ├── onnxruntime_session_options_config_keys.h
......@@ -41,9 +49,15 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│   ├── cuda
│   │   ├── ...
│   │   └── ops_cuda.cu
│   └── cpu
│   ├── cpu
│   │   ├── ...
│   │   └── ops.cpp
│   ├── mps
│   │   ├── ...
│   |   └── op_mps.mm
│   └── mlu
│      ├── ...
│      └── ops.cpp
│      └── op_mlu.cpp
└── tensorrt
├── trt_cuda_helper.cuh
├── trt_plugin_helper.hpp
......@@ -63,108 +77,113 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
- `common`: This directory contains all tools and shared codes.
- `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax.
- `onnxruntime`: **ONNX Runtime** support for custom ops.
- `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.
- `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) device.
- `utils`: The kernels and utils of spconv.
- `onnxruntime`: **ONNX Runtime** support for custom ops. Has been deprecated, please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
- `cpu`: CPU implementation of supported ops.
- `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory.
- `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops.
- `cpu`: This directory contains the CPU implementations of the corresponding custom ops.
- `tensorrt`: **TensorRT** support for custom ops.
- `mlu`: This directory contains the launchers of the MLU kernels.
- `mps`: MPS ops implementation and launchers.
- `tensorrt`: **TensorRT** support for custom ops. Has been deprecated, please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
- `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use shared cuda kernel in `common/cuda`.
## How to add new PyTorch ops?
1. (Optional) Add shared kernel in `common` to support special hardware platform.
```c++
// src/common/cuda/new_ops_cuda_kernel.cuh
template <typename T>
__global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {
// forward here
}
```
Add cuda kernel launcher in `pytorch/cuda`.
```c++
// src/pytorch/cuda
#include <new_ops_cuda_kernel.cuh>
void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){
// initialize
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
...
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] {
new_ops_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
```
```c++
// src/common/cuda/new_ops_cuda_kernel.cuh
template <typename T>
__global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {
// forward here
}
```
Add cuda kernel launcher in `pytorch/cuda`.
```c++
// src/pytorch/cuda
#include <new_ops_cuda_kernel.cuh>
void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){
// initialize
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
...
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] {
new_ops_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
```
2. Register implementation for different devices.
```c++
// src/pytorch/cuda/cudabind.cpp
...
```c++
// src/pytorch/cuda/cudabind.cpp
...
Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
// implement cuda forward here
// use `NewOpsForwardCUDAKernelLauncher` here
}
// declare interface here.
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
// register the implementation for given device (CUDA here).
REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
```
Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
// implement cuda forward here
// use `NewOpsForwardCUDAKernelLauncher` here
}
// declare interface here.
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
// register the implementation for given device (CUDA here).
REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
```
3. Add ops implementation in `pytorch` directory. Select different implementations according to device type.
```c++
// src/pytorch/new_ops.cpp
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
// dispatch the implementation according to the device type of input.
DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
}
...
```c++
// src/pytorch/new_ops.cpp
Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
// dispatch the implementation according to the device type of input.
DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
}
...
Tensor new_ops_forward(Tensor input, Tensor output, ...){
return new_ops_forward_impl(input, output, ...);
}
```
Tensor new_ops_forward(Tensor input, Tensor output, ...){
return new_ops_forward_impl(input, output, ...);
}
```
4. Binding the implementation in `pytorch/pybind.cpp`
```c++
// src/pytorch/pybind.cpp
```c++
// src/pytorch/pybind.cpp
...
...
Tensor new_ops_forward(Tensor input, Tensor output, ...);
Tensor new_ops_forward(Tensor input, Tensor output, ...);
...
...
// bind with pybind11
m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
py::arg("input"), py::arg("output"), ...);
// bind with pybind11
m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
py::arg("input"), py::arg("output"), ...);
...
...
```
```
5. Build MMCV again. Enjoy new ops in python
```python
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])
```python
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])
...
...
ext_module.new_ops_forward(input, output, ...)
ext_module.new_ops_forward(input, output, ...)
```
```
......@@ -220,6 +220,10 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
return temp > 0;
}
});
// compute distance to origin after sort, since the points are now different.
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
#endif
// Step 4:
......
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
// Forward kernel: scatters every weight element of the active rotated
// filter into its `num_rotations` rotated positions in the output tensor.
// One thread handles one element of `weight_data`.
template <typename scalar_t>
__global__ void active_rotated_filter_forward_cuda_kernel(
    const int nthreads, const scalar_t* weight_data, const int* indices_data,
    const int num_input_planes, const int num_output_planes,
    const int num_orientations, const int num_rotations, const int nEntry,
    scalar_t* output_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat index into (output plane i, input plane j,
    // entry l inside one filter of size nEntry).
    int l = index % nEntry;
    int j = (index / nEntry) % num_input_planes;
    int i = index / nEntry / num_input_planes;
    int k;
    scalar_t val = *(weight_data + index);
    for (k = 0; k < num_rotations; k++) {
      // indices_data stores 1-based positions; convert to 0-based offset.
      int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
      // Write the same weight value to rotation k of the output layout
      // [i][k][j][idx].
      scalar_t* target = output_data +
                         i * (num_rotations * num_input_planes * nEntry) +
                         k * (num_input_planes * nEntry) + j * (nEntry) + idx;
      *target = val;
    }
  }
}
// Backward kernel: gathers the gradients of all rotated copies back into
// the original weight layout (the inverse of the forward scatter). One
// thread accumulates the gradient for one weight element.
template <typename scalar_t>
__global__ void active_rotated_filter_backward_cuda_kernel(
    const int nthreads, const scalar_t* gradWeight_data,
    const int* indices_data, const int num_input_planes,
    const int num_output_planes, const int num_orientations,
    const int num_rotations, const int nEntry, scalar_t* weight_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Same flat-index decomposition as the forward kernel.
    int l = index % nEntry;
    int j = (index / nEntry) % num_input_planes;
    int i = index / nEntry / num_input_planes;
    int k;
    scalar_t* val = weight_data + index;
    *val = 0;
    scalar_t tmp = 0;
    for (k = 0; k < num_rotations; k++) {
      // indices_data stores 1-based positions; convert to 0-based offset.
      int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
      // Read the upstream gradient of rotation k at layout [i][k][j][idx].
      scalar_t target =
          *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) +
            k * (num_input_planes * nEntry) + j * (nEntry) + idx);
      // Sum contributions from every rotation into the single weight grad.
      tmp = tmp + target;
    }
    *val = tmp;
  }
}
#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
......@@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel(
const int O, const int aggregate, const T* points, const T* centers,
const T* scores, const int64_t* knn_idx, T* output) {
// ----- parallel loop for B, N1, K and O ---------
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B * N1 * K * O) return;
// ------- loop for M ----------
const int b = (int)(i / (O * N1 * K));
const int o = (int)(i % (O * N1 * K) / (N1 * K));
const int n = (int)(i % (N1 * K) / K);
const int k = (int)(i % K);
const int cn = (int)knn_idx[b * K * N1 + n * K +
0]; // The first neighbor is the center point
const int kn = (int)knn_idx[b * K * N1 + n * K + k];
if (kn >= N0 ||
kn < 0) { // if index overflows, it is out of the neighborhood range
return;
}
assert(b < B);
assert(kn < N0);
assert(cn < N0);
assert(o < O);
assert(n < N1);
const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
T val = output[out_idx];
for (int m = 0; m < M; m++) {
val += points[b * N0 * M * O + kn * M * O + m * O + o] *
scores[b * N1 * K * M + n * K * M + k * M + m] -
centers[b * N0 * M * O + cn * M * O + m * O + o] *
scores[b * N1 * K * M + n * K * M + k * M + m];
CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) {
// ------- loop for M ----------
const int b = (int)(i / (O * N1 * K));
const int o = (int)(i % (O * N1 * K) / (N1 * K));
const int n = (int)(i % (N1 * K) / K);
const int k = (int)(i % K);
const int cn = (int)knn_idx[b * K * N1 + n * K +
0]; // The first neighbor is the center point
const int kn = (int)knn_idx[b * K * N1 + n * K + k];
if (kn >= N0 ||
kn < 0) { // if index overflows, it is out of the neighborhood range
return;
}
assert(b < B);
assert(kn < N0);
assert(cn < N0);
assert(o < O);
assert(n < N1);
const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
T val = output[out_idx];
for (int m = 0; m < M; m++) {
val += points[b * N0 * M * O + kn * M * O + m * O + o] *
scores[b * N1 * K * M + n * K * M + k * M + m] -
centers[b * N0 * M * O + cn * M * O + m * O + o] *
scores[b * N1 * K * M + n * K * M + k * M + m];
}
output[out_idx] = val;
}
output[out_idx] = val;
}
template <typename T>
......@@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* scores,
const int64_t* knn_idx, T* grad_points, T* grad_centers) {
// ----- parallel loop for B, M, O ---------
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B * M * O) return;
int b = (int)(i / (M * O));
int m = (int)(i % (M * O) / O);
int o = (int)(i % O);
CUDA_1D_KERNEL_LOOP(i, B * M * O) {
int b = (int)(i / (M * O));
int m = (int)(i % (M * O) / O);
int o = (int)(i % O);
// ----- loop for N,K ---------
for (int n = 0; n < N; n++) {
for (int k = 0; k < K; k++) {
int kn = knn_idx[b * N * K + n * K + k];
int cn = knn_idx[b * N * K + n * K + 0];
if (kn >= N0 ||
kn < 0) { // if index overflows, it is out of the neighborhood range
continue;
// ----- loop for N,K ---------
for (int n = 0; n < N; n++) {
for (int k = 0; k < K; k++) {
int kn = knn_idx[b * N * K + n * K + k];
int cn = knn_idx[b * N * K + n * K + 0];
if (kn >= N0 || kn < 0) { // if index overflows, it is out of the
// neighborhood range
continue;
}
atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
-scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
}
atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
-scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
}
}
}
......@@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* points,
const T* centers, const int64_t* knn_idx, T* grad_scores) {
// ----- parallel loop for B, N, K, M ---------
long i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= B * N * K * M) return;
const int b = (int)(i / (N * M * K));
const int n = (int)(i % (N * M * K) / M / K);
const int k = (int)(i % (M * K) / M);
const int m = (int)(i % M);
const int cn = knn_idx[b * N * K + n * K + 0];
const int kn = knn_idx[b * N * K + n * K + k];
if (kn >= N0 ||
kn < 0) { // if index overflows, it is out of the neighborhood range
return;
}
CUDA_1D_KERNEL_LOOP(i, B * N * K * M) {
const int b = (int)(i / (N * M * K));
const int n = (int)(i % (N * M * K) / M / K);
const int k = (int)(i % (M * K) / M);
const int m = (int)(i % M);
const int cn = knn_idx[b * N * K + n * K + 0];
const int kn = knn_idx[b * N * K + n * K + k];
if (kn >= N0 ||
kn < 0) { // if index overflows, it is out of the neighborhood range
return;
}
// -------------- loop for O ------------------------
const int out_idx = b * N * K * M + n * K * M + k * M + m;
T val = grad_scores[out_idx];
for (int o = 0; o < O; o++) {
val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
centers[b * N0 * M * O + cn * M * O + m * O + o]) *
grad_out[b * O * N * K + o * N * K + n * K + k];
// -------------- loop for O ------------------------
const int out_idx = b * N * K * M + n * K * M + k * M + m;
T val = grad_scores[out_idx];
for (int o = 0; o < O; o++) {
val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
centers[b * N0 * M * O + cn * M * O + m * O + o]) *
grad_out[b * O * N * K + o * N * K + n * K + k];
}
grad_scores[out_idx] = val;
}
grad_scores[out_idx] = val;
}
#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
......@@ -21,35 +21,36 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
// output:
// idx: (B, M, nsample)
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || pt_idx >= m) return;
CUDA_1D_KERNEL_LOOP(pt_idx, m) {
if (bs_idx >= b) return;
new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample;
new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample;
float max_radius2 = max_radius * max_radius;
float min_radius2 = min_radius * min_radius;
T new_x = new_xyz[0];
T new_y = new_xyz[1];
T new_z = new_xyz[2];
float max_radius2 = max_radius * max_radius;
float min_radius2 = min_radius * min_radius;
T new_x = new_xyz[0];
T new_y = new_xyz[1];
T new_z = new_xyz[2];
int cnt = 0;
for (int k = 0; k < n; ++k) {
T x = xyz[k * 3 + 0];
T y = xyz[k * 3 + 1];
T z = xyz[k * 3 + 2];
T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z);
if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
if (cnt == 0) {
for (int l = 0; l < nsample; ++l) {
idx[l] = k;
int cnt = 0;
for (int k = 0; k < n; ++k) {
T x = xyz[k * 3 + 0];
T y = xyz[k * 3 + 1];
T z = xyz[k * 3 + 2];
T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z);
if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
if (cnt == 0) {
for (int l = 0; l < nsample; ++l) {
idx[l] = k;
}
}
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
}
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
}
}
}
......
......@@ -8,6 +8,27 @@
#include "pytorch_cuda_helper.hpp"
#endif
// Loads one box (x1, y1, x2, y2) starting at bbox[base] into the four
// output references.
template <typename T>
__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,
                                          T& y1, T& x2, T& y2) {
  x1 = bbox[base];
  y1 = bbox[base + 1];
  x2 = bbox[base + 2];
  y2 = bbox[base + 3];
}

// float specialization: a single vectorized float4 load instead of four
// scalar loads. NOTE(review): the reinterpret_cast assumes bbox + base is
// 16-byte aligned; confirm callers only pass base as a multiple of 4 into
// an aligned buffer.
template <>
__device__ __forceinline__ void load_bbox<float>(const float* bbox,
                                                 const int base, float& x1,
                                                 float& y1, float& x2,
                                                 float& y2) {
  const float4 bbox_offset = reinterpret_cast<const float4*>(bbox + base)[0];
  x1 = bbox_offset.x;
  y1 = bbox_offset.y;
  x2 = bbox_offset.z;
  y2 = bbox_offset.w;
}
template <typename T>
__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
T* ious, const int num_bbox1,
......@@ -16,69 +37,111 @@ __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
const int offset) {
if (aligned) {
CUDA_1D_KERNEL_LOOP(index, num_bbox1) {
int b1 = index;
int b2 = index;
int base1 = b1 * 4;
T b1_x1 = bbox1[base1];
T b1_y1 = bbox1[base1 + 1];
T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
int base2 = b2 * 4;
T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
T b2_x2 = bbox2[base2 + 2];
T b2_y2 = bbox2[base2 + 3];
T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
T width = fmaxf(right - left + offset, 0.f);
T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
const int b1 = index;
const int b2 = index;
const int base1 = b1 << 2; // b1 * 4
T b1_x1, b1_y1, b1_x2, b1_y2;
load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
const int base2 = b2 << 2; // b2 * 4
T b2_x1, b2_y1, b2_x2, b2_y2;
load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
const T width = fmaxf(right - left + offset, 0.f);
const T height = fmaxf(bottom - top + offset, 0.f);
const T interS = width * height;
const T baseS =
fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
ious[index] = interS / baseS;
}
} else {
CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
int b1 = index / num_bbox2;
int b2 = index % num_bbox2;
int base1 = b1 * 4;
T b1_x1 = bbox1[base1];
T b1_y1 = bbox1[base1 + 1];
T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
int base2 = b2 * 4;
T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
T b2_x2 = bbox2[base2 + 2];
T b2_y2 = bbox2[base2 + 3];
T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
T width = fmaxf(right - left + offset, 0.f);
T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
const int b1 = index / num_bbox2;
const int b2 = index % num_bbox2;
const int base1 = b1 << 2; // b1 * 4
T b1_x1, b1_y1, b1_x2, b1_y2;
load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
const int base2 = b2 << 2; // b2 * 4
T b2_x1, b2_y1, b2_x2, b2_y2;
load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
const T width = fmaxf(right - left + offset, 0.f);
const T height = fmaxf(bottom - top + offset, 0.f);
const T interS = width * height;
const T baseS =
fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
ious[index] = interS / baseS;
}
}
}
// Half-precision intrinsics (__hadd/__hmul/__hge/...) require compute
// capability >= 5.3.
#if __CUDA_ARCH__ >= 530

// Area of the box (x1, y1)-(x2, y2) in half precision, including `offset`
// on each side length (offset is 0 or 1 in the callers' convention).
__device__ __forceinline__ __half __half_area(const __half x1, const __half y1,
                                              const __half x2, const __half y2,
                                              const __half offset) {
  const __half half_w = __hadd(__hsub(x2, x1), offset);
  const __half half_h = __hadd(__hsub(y2, y1), offset);
  return __hmul(half_w, half_h);
}

// max(a, b) for __half; no such intrinsic exists, so compare explicitly.
__device__ __forceinline__ __half __half_max(const __half a, const __half b) {
  return __hge(a, b) ? a : b;
}

// min(a, b) for __half.
__device__ __forceinline__ __half __half_min(const __half a, const __half b) {
  return __hle(a, b) ? a : b;
}

// fp16 won't provide much increase when aligned==true. It is useful when
// aligned==false, which would give you ~40% bonus.
// Computes IoU (mode == 0) or IoF (otherwise) between bbox1 and bbox2
// entirely in half precision; one loop iteration produces one output iou.
__device__ void bbox_overlaps_cuda_kernel_half(
    const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1,
    const int num_bbox2, const int mode, const bool aligned, const int offset) {
  // aligned: row i of bbox1 is paired with row i of bbox2; otherwise the
  // full num_bbox1 x num_bbox2 cross product is computed.
  const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2;
  const __half h_offset = __int2half_rn(offset);
  CUDA_1D_KERNEL_LOOP(index, num_output) {
    const int b1 = aligned ? index : index / num_bbox2;
    const int b2 = aligned ? index : index % num_bbox2;

    // base = b * 4: each box occupies four consecutive __half values.
    const int base1 = b1 << 2;
    __half b1_x1, b1_y1, b1_x2, b1_y2;
    load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
    const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset);

    const int base2 = b2 << 2;
    __half b2_x1, b2_y1, b2_x2, b2_y2;
    load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
    const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset);

    // Intersection rectangle, clamped to zero width/height when disjoint.
    const __half left = __half_max(b1_x1, b2_x1),
                 right = __half_min(b1_x2, b2_x2);
    const __half top = __half_max(b1_y1, b2_y1),
                 bottom = __half_min(b1_y2, b2_y2);
    const __half width =
        __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f));
    const __half height =
        __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f));
    const __half interS = __hmul(width, height);

    // Denominator: union area for IoU, bbox1's area for IoF; clamped by
    // h_offset to avoid division by zero.
    const __half baseS = __half_max(
        mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area,
        h_offset);
    ious[index] = __hdiv(interS, baseS);
  }
}
#endif  // __CUDA_ARCH__ >= 530
#endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment