"docs/zh_cn/conf.py" did not exist on "da39212f94a3d433cf99e55cfecd77ebf359e0ca"
Commit 91da9643 authored by limm's avatar limm
Browse files

support v2.1.0

parent 6f674c7e
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from functools import partial
from typing import Dict, Optional, Tuple, Union

import torch
@@ -14,6 +15,56 @@ from .norm import build_norm_layer
from .padding import build_padding_layer
def efficient_conv_bn_eval_forward(bn: _BatchNorm,
                                   conv: nn.modules.conv._ConvNd,
                                   x: torch.Tensor):
    """
    Implementation based on https://arxiv.org/abs/2305.11624
    "Tune-Mode ConvBN Blocks For Efficient Transfer Learning"
    It leverages the associative law between convolution and affine transform,
    i.e., normalize (weight conv feature) = (normalize weight) conv feature.
    It works for Eval mode of ConvBN blocks during validation, and can be used
    for training as well. It reduces memory and computation cost.

    Args:
        bn (_BatchNorm): a BatchNorm module.
        conv (nn._ConvNd): a conv module
        x (torch.Tensor): Input feature map.
    """
    # These lines of code are designed to deal with various cases
    # like bn without affine transform, and conv without bias
    weight_on_the_fly = conv.weight
    if conv.bias is not None:
        bias_on_the_fly = conv.bias
    else:
        bias_on_the_fly = torch.zeros_like(bn.running_var)

    if bn.weight is not None:
        bn_weight = bn.weight
    else:
        bn_weight = torch.ones_like(bn.running_var)

    if bn.bias is not None:
        bn_bias = bn.bias
    else:
        bn_bias = torch.zeros_like(bn.running_var)

    # shape of [C_out, 1, 1, 1] in Conv2d
    weight_coeff = torch.rsqrt(bn.running_var +
                               bn.eps).reshape([-1] + [1] *
                                               (len(conv.weight.shape) - 1))
    # shape of [C_out, 1, 1, 1] in Conv2d
    coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff

    # shape of [C_out, C_in, k, k] in Conv2d
    weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly
    # shape of [C_out] in Conv2d
    bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() * \
        (bias_on_the_fly - bn.running_mean)

    return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly)
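A minimal sanity-check sketch, assuming the function above is in scope: with bn in eval mode, batch norm reduces to a per-channel affine transform, so the fused single-conv path should match a conventional conv followed by bn.

import torch
import torch.nn as nn

conv = nn.Conv2d(3, 8, kernel_size=3, padding=1, bias=True)
bn = nn.BatchNorm2d(8)
conv.eval()
bn.eval()  # running statistics are used, which is what the fused path assumes

x = torch.randn(2, 3, 16, 16)
with torch.no_grad():
    y_ref = bn(conv(x))                                    # conventional conv -> bn
    y_fused = efficient_conv_bn_eval_forward(bn, conv, x)  # single fused conv
print(torch.allclose(y_ref, y_fused, atol=1e-5))  # expected: True
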
@MODELS.register_module()
class ConvModule(nn.Module):
    """A conv block that bundles conv/norm/activation layers.
@@ -65,6 +116,9 @@ class ConvModule(nn.Module):
            sequence of "conv", "norm" and "act". Common examples are
            ("conv", "norm", "act") and ("act", "conv", "norm").
            Default: ('conv', 'norm', 'act').
        efficient_conv_bn_eval (bool): Whether to use the efficient conv
            when the consecutive bn is in eval mode (either during training
            or testing), as proposed in https://arxiv.org/abs/2305.11624 .
            Default: `False`.
""" """
_abbr_ = 'conv_block' _abbr_ = 'conv_block'
...@@ -84,7 +138,8 @@ class ConvModule(nn.Module): ...@@ -84,7 +138,8 @@ class ConvModule(nn.Module):
inplace: bool = True, inplace: bool = True,
with_spectral_norm: bool = False, with_spectral_norm: bool = False,
padding_mode: str = 'zeros', padding_mode: str = 'zeros',
order: tuple = ('conv', 'norm', 'act')): order: tuple = ('conv', 'norm', 'act'),
efficient_conv_bn_eval: bool = False):
super().__init__() super().__init__()
assert conv_cfg is None or isinstance(conv_cfg, dict) assert conv_cfg is None or isinstance(conv_cfg, dict)
assert norm_cfg is None or isinstance(norm_cfg, dict) assert norm_cfg is None or isinstance(norm_cfg, dict)
...@@ -155,6 +210,8 @@ class ConvModule(nn.Module): ...@@ -155,6 +210,8 @@ class ConvModule(nn.Module):
else: else:
self.norm_name = None # type: ignore self.norm_name = None # type: ignore
self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)
# build activation layer # build activation layer
if self.with_activation: if self.with_activation:
act_cfg_ = act_cfg.copy() # type: ignore act_cfg_ = act_cfg.copy() # type: ignore
...@@ -200,13 +257,82 @@ class ConvModule(nn.Module): ...@@ -200,13 +257,82 @@ class ConvModule(nn.Module):
x: torch.Tensor, x: torch.Tensor,
activate: bool = True, activate: bool = True,
norm: bool = True) -> torch.Tensor: norm: bool = True) -> torch.Tensor:
        layer_index = 0
        while layer_index < len(self.order):
            layer = self.order[layer_index]
            if layer == 'conv':
                if self.with_explicit_padding:
                    x = self.padding_layer(x)
                # if the next operation is norm and we have a norm layer in
                # eval mode and we have enabled `efficient_conv_bn_eval` for
                # the conv operator, then activate the optimized forward and
                # skip the next norm operator since it has been fused
                if layer_index + 1 < len(self.order) and \
                        self.order[layer_index + 1] == 'norm' and norm and \
                        self.with_norm and not self.norm.training and \
                        self.efficient_conv_bn_eval_forward is not None:
                    self.conv.forward = partial(
                        self.efficient_conv_bn_eval_forward, self.norm,
                        self.conv)
                    layer_index += 1
                    x = self.conv(x)
                    del self.conv.forward
                else:
                    x = self.conv(x)
            elif layer == 'norm' and norm and self.with_norm:
                x = self.norm(x)
            elif layer == 'act' and activate and self.with_activation:
                x = self.activate(x)
            layer_index += 1
        return x
    def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True):
        # efficient_conv_bn_eval works for conv + bn
        # with `track_running_stats` option
        if efficient_conv_bn_eval and self.norm \
                and isinstance(self.norm, _BatchNorm) \
                and self.norm.track_running_stats:
            self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward  # noqa: E501
        else:
            self.efficient_conv_bn_eval_forward = None  # type: ignore

    @staticmethod
    def create_from_conv_bn(conv: torch.nn.modules.conv._ConvNd,
                            bn: torch.nn.modules.batchnorm._BatchNorm,
                            efficient_conv_bn_eval=True) -> 'ConvModule':
        """Create a ConvModule from a conv and a bn module."""
        self = ConvModule.__new__(ConvModule)
        super(ConvModule, self).__init__()

        self.conv_cfg = None
        self.norm_cfg = None
        self.act_cfg = None
        self.inplace = False
        self.with_spectral_norm = False
        self.with_explicit_padding = False
        self.order = ('conv', 'norm', 'act')

        self.with_norm = True
        self.with_activation = False
        self.with_bias = conv.bias is not None

        # build convolution layer
        self.conv = conv
        # export the attributes of self.conv to a higher level for convenience
        self.in_channels = self.conv.in_channels
        self.out_channels = self.conv.out_channels
        self.kernel_size = self.conv.kernel_size
        self.stride = self.conv.stride
        self.padding = self.conv.padding
        self.dilation = self.conv.dilation
        self.transposed = self.conv.transposed
        self.output_padding = self.conv.output_padding
        self.groups = self.conv.groups

        # build normalization layers
        self.norm_name, norm = 'bn', bn
        self.add_module(self.norm_name, norm)

        self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)

        return self
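A hedged usage sketch for the new `create_from_conv_bn` helper: it wraps an existing conv/bn pair into a `ConvModule` so the fused eval-mode forward above can be exercised; the `from mmcv.cnn import ConvModule` path is an assumption based on the usual mmcv layout.

import torch
import torch.nn as nn
from mmcv.cnn import ConvModule  # assumed import path

conv = nn.Conv2d(16, 32, kernel_size=3, padding=1)
bn = nn.BatchNorm2d(32)
module = ConvModule.create_from_conv_bn(conv, bn, efficient_conv_bn_eval=True)

module.eval()  # with bn in eval mode, conv and bn are fused on the fly
x = torch.randn(1, 16, 8, 8)
with torch.no_grad():
    y = module(x)
print(y.shape)  # torch.Size([1, 32, 8, 8])
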
@@ -371,7 +371,7 @@ class GeneralizedAttention(nn.Module):
                    contiguous().\
                    view(1, 1, h*w, h_kv*w_kv)

                energy = energy.masked_fill_(cur_local_constraint_map.bool(),
                                             float('-inf'))

            attention = F.softmax(energy, 3)
......
@@ -98,14 +98,17 @@ def build_norm_layer(cfg: Dict,
    layer_type = cfg_.pop('type')

    if inspect.isclass(layer_type):
        norm_layer = layer_type
    else:
        # Switch registry to the target scope. If `norm_layer` cannot be found
        # in the registry, fallback to search `norm_layer` in the
        # mmengine.MODELS.
        with MODELS.switch_scope_and_registry(None) as registry:
            norm_layer = registry.get(layer_type)
        if norm_layer is None:
            raise KeyError(f'Cannot find {norm_layer} in registry under '
                           f'scope name {registry.scope}')
    abbr = infer_abbr(norm_layer)

    assert isinstance(postfix, (int, str))
@@ -113,7 +116,7 @@ def build_norm_layer(cfg: Dict,
    requires_grad = cfg_.pop('requires_grad', True)
    cfg_.setdefault('eps', 1e-5)

    if norm_layer is not nn.GroupNorm:
        layer = norm_layer(num_features, **cfg_)
        if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
            layer._specify_ddp_gpu_num(1)
......
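Illustrative only: with the `inspect.isclass` branch above, `build_norm_layer` now accepts a layer class as `type` in addition to a registry string. A minimal sketch, assuming `build_norm_layer` is imported from `mmcv.cnn`:

import torch.nn as nn
from mmcv.cnn import build_norm_layer  # assumed import path

# registry string, as before
name, layer = build_norm_layer(dict(type='BN'), num_features=64)
# class object, newly supported
name2, layer2 = build_norm_layer(dict(type=nn.BatchNorm2d), num_features=64)
print(name, type(layer).__name__)    # bn BatchNorm2d
print(name2, type(layer2).__name__)  # bn BatchNorm2d
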
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
from typing import Dict

import torch.nn as nn
@@ -27,7 +28,8 @@ def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
    cfg_ = cfg.copy()
    padding_type = cfg_.pop('type')

    if inspect.isclass(padding_type):
        return padding_type(*args, **kwargs, **cfg_)

    # Switch registry to the target scope. If `padding_layer` cannot be found
    # in the registry, fallback to search `padding_layer` in the
    # mmengine.MODELS.
......
@@ -79,14 +79,17 @@ def build_plugin_layer(cfg: Dict,
    cfg_ = cfg.copy()
    layer_type = cfg_.pop('type')

    if inspect.isclass(layer_type):
        plugin_layer = layer_type
    else:
        # Switch registry to the target scope. If `plugin_layer` cannot be
        # found in the registry, fallback to search `plugin_layer` in the
        # mmengine.MODELS.
        with MODELS.switch_scope_and_registry(None) as registry:
            plugin_layer = registry.get(layer_type)
        if plugin_layer is None:
            raise KeyError(
                f'Cannot find {plugin_layer} in registry under scope '
                f'name {registry.scope}')
    abbr = infer_abbr(plugin_layer)
......
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
from typing import Dict

import torch
@@ -76,9 +77,12 @@ def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
    layer_type = cfg_.pop('type')

    if inspect.isclass(layer_type):
        upsample = layer_type
    # Switch registry to the target scope. If `upsample` cannot be found
    # in the registry, fallback to search `upsample` in the
    # mmengine.MODELS.
    else:
        with MODELS.switch_scope_and_registry(None) as registry:
            upsample = registry.get(layer_type)
        if upsample is None:
......
@@ -41,7 +41,7 @@ class NewEmptyTensorOp(torch.autograd.Function):
class Conv2d(nn.Conv2d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size,
                                     self.padding, self.stride, self.dilation):
@@ -62,7 +62,7 @@ class Conv2d(nn.Conv2d):
class Conv3d(nn.Conv3d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size,
                                     self.padding, self.stride, self.dilation):
@@ -84,7 +84,7 @@ class Conv3d(nn.Conv3d):
class ConvTranspose2d(nn.ConvTranspose2d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size,
                                         self.padding, self.stride,
@@ -106,7 +106,7 @@ class ConvTranspose2d(nn.ConvTranspose2d):
class ConvTranspose3d(nn.ConvTranspose3d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size,
                                         self.padding, self.stride,
@@ -127,7 +127,7 @@ class MaxPool2d(nn.MaxPool2d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # PyTorch 1.9 does not support empty tensor inference yet
        if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0:
            out_shape = list(x.shape[:2])
            for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size),
                                     _pair(self.padding), _pair(self.stride),
@@ -145,7 +145,7 @@ class MaxPool3d(nn.MaxPool3d):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # PyTorch 1.9 does not support empty tensor inference yet
        if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0:
            out_shape = list(x.shape[:2])
            for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size),
                                     _triple(self.padding),
@@ -164,7 +164,7 @@ class Linear(torch.nn.Linear):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # empty tensor forward of Linear layer is supported in Pytorch 1.6
        if obsolete_torch_version(TORCH_VERSION, (1, 5)) and x.numel() == 0:
            out_shape = [x.shape[0], self.out_features]
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
......
@@ -16,13 +16,13 @@ except ImportError:

def _scale_size(
    size: Tuple[int, int],
    scale: Union[float, int, Tuple[float, float], Tuple[int, int]],
) -> Tuple[int, int]:
    """Rescale a size by a ratio.

    Args:
        size (tuple[int]): (w, h).
        scale (float | int | tuple(float) | tuple(int)): Scaling factor.

    Returns:
        tuple[int]: scaled size.
@@ -128,7 +128,8 @@ def imresize_to_multiple(
    img: np.ndarray,
    divisor: Union[int, Tuple[int, int]],
    size: Union[int, Tuple[int, int], None] = None,
    scale_factor: Union[float, int, Tuple[float, float], Tuple[int, int],
                        None] = None,
    keep_ratio: bool = False,
    return_scale: bool = False,
    interpolation: str = 'bilinear',
@@ -145,9 +146,10 @@ def imresize_to_multiple(
            divisor. If divisor is a tuple, divisor should be
            (w_divisor, h_divisor).
        size (None | int | tuple[int]): Target size (w, h). Default: None.
        scale_factor (None | float | int | tuple[float] | tuple[int]):
            Multiplier for spatial size. Should match input size if it is a
            tuple and the 2D style is (w_scale_factor, h_scale_factor).
            Default: None.
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image. Default: False.
        return_scale (bool): Whether to return `w_scale` and `h_scale`.
@@ -215,16 +217,16 @@ def imresize_like(

def rescale_size(old_size: tuple,
                 scale: Union[float, int, Tuple[int, int]],
                 return_scale: bool = False) -> tuple:
    """Calculate the new size to be rescaled to.

    Args:
        old_size (tuple[int]): The old size (w, h) of image.
        scale (float | int | tuple[int]): The scaling factor or maximum size.
            If it is a float number or an integer, then the image will be
            rescaled by this factor, else if it is a tuple of 2 integers, then
            the image will be rescaled as large as possible within the scale.
        return_scale (bool): Whether to return the scaling factor besides the
            rescaled image size.
@@ -255,7 +257,7 @@ def rescale_size(old_size: tuple,

def imrescale(
    img: np.ndarray,
    scale: Union[float, int, Tuple[int, int]],
    return_scale: bool = False,
    interpolation: str = 'bilinear',
    backend: Optional[str] = None
@@ -264,10 +266,10 @@ def imrescale(

    Args:
        img (ndarray): The input image.
        scale (float | int | tuple[int]): The scaling factor or maximum size.
            If it is a float number or an integer, then the image will be
            rescaled by this factor, else if it is a tuple of 2 integers, then
            the image will be rescaled as large as possible within the scale.
        return_scale (bool): Whether to return the scaling factor besides the
            rescaled image.
        interpolation (str): Same as :func:`resize`.
......
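Illustrative only: the widened `scale` annotations in this file mean both plain numbers and (w, h) bounds are accepted. A small sketch, assuming `rescale_size` is exported from `mmcv.image`:

from mmcv.image import rescale_size  # assumed import path

print(rescale_size((1024, 768), 0.5))         # (512, 384): scale by a float factor
print(rescale_size((1024, 768), 2))           # (2048, 1536): int factors now documented
print(rescale_size((1024, 768), (640, 640)))  # (640, 480): fit inside 640x640, keeping ratio
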
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.utils import IS_MLU_AVAILABLE
from .active_rotated_filter import active_rotated_filter
from .assign_score_withk import assign_score_withk
from .ball_query import ball_query
@@ -109,3 +110,9 @@ __all__ = [
    'PrRoIPool', 'prroi_pool', 'bias_act', 'filtered_lrelu', 'conv2d',
    'conv_transpose2d', 'filter2d', 'upsample2d', 'BezierAlign', 'bezier_align'
]

if IS_MLU_AVAILABLE:
    from .deform_conv import DeformConv2dPack_MLU  # noqa:F401
    from .modulated_deform_conv import \
        ModulatedDeformConv2dPack_MLU  # noqa:F401
    __all__.extend(['ModulatedDeformConv2dPack_MLU', 'DeformConv2dPack_MLU'])
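A hedged sketch of how the optional MLU ops can be probed from user code, mirroring the conditional export above; the class names come directly from the `__all__.extend` call and `IS_MLU_AVAILABLE` is imported exactly as in the diff.

from mmcv.utils import IS_MLU_AVAILABLE

if IS_MLU_AVAILABLE:
    # only importable when a Cambricon MLU build of mmcv is present
    from mmcv.ops import DeformConv2dPack_MLU, ModulatedDeformConv2dPack_MLU
    print('MLU deformable conv ops are available')
else:
    print('running without Cambricon MLU support')
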
@@ -116,6 +116,10 @@ def bbox_overlaps(bboxes1: torch.Tensor,
    if rows * cols == 0:
        return ious

    if bboxes1.device.type == 'cpu' and torch.__version__ == 'parrots':
        return _bbox_overlaps_cpu(
            bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset)

    ext_module.bbox_overlaps(
        bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)
......
@@ -132,6 +132,9 @@ def box_iou_rotated(bboxes1: torch.Tensor,
    cols = bboxes2.size(0)
    if aligned:
        ious = bboxes1.new_zeros(rows)
    else:
        if bboxes1.device.type == 'mlu':
            ious = bboxes1.new_zeros([rows, cols])
        else:
            ious = bboxes1.new_zeros(rows * cols)
    if not clockwise:
@@ -139,6 +142,11 @@ def box_iou_rotated(bboxes1: torch.Tensor,
        flip_mat[-1] = -1
        bboxes1 = bboxes1 * flip_mat
        bboxes2 = bboxes2 * flip_mat
    if bboxes1.device.type == 'npu':
        scale_mat = bboxes1.new_ones(bboxes1.shape[-1])
        scale_mat[-1] = 1.0 / 0.01745329252
        bboxes1 = bboxes1 * scale_mat
        bboxes2 = bboxes2 * scale_mat
    bboxes1 = bboxes1.contiguous()
    bboxes2 = bboxes2.contiguous()
    ext_module.box_iou_rotated(
......
@@ -16,6 +16,7 @@ from typing import Dict, Optional, Tuple, Union

import torch
from mmengine.utils import digit_version
from mmengine.utils.dl_utils.parrots_wrapper import is_rocm_pytorch

enabled = True
weight_gradients_disabled = False
@@ -283,19 +284,10 @@ def _conv2d_gradfix(
                    output_padding=output_padding,
                    output_mask=[0, 1, 0])[1]
            else:
                if is_rocm_pytorch():
                    name = 'aten::miopen_convolution_transpose_backward_weight'
                    if not transpose:
                        name = 'aten::miopen_convolution_backward_weight'
                flags = [
                    torch.backends.cudnn.benchmark,
                    torch.backends.cudnn.deterministic
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmengine.utils import digit_version
from torch import Tensor, nn

_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
@@ -70,7 +71,8 @@ class CornerPool(nn.Module):
        self.mode = mode

    def forward(self, x: Tensor) -> Tensor:
        if (torch.__version__ != 'parrots' and
                digit_version(torch.__version__) >= digit_version('1.5.0')):
            dim, flip = self.cummax_dim_flip[self.mode]
            if flip:
                x = x.flip(dim)
......
@@ -2,6 +2,8 @@
#ifndef CARAFE_CUDA_KERNEL_CUH
#define CARAFE_CUDA_KERNEL_CUH

#include <ATen/cuda/DeviceUtils.cuh>

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
@@ -56,7 +58,8 @@ template <>
__device__ __forceinline__ phalf warpReduceSum(phalf val) {
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP
    // Using PyTorch's macro for half support
    __PHALF(val) += WARP_SHFL_DOWN(val, offset);
#else
    __PHALF(val) +=
        __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset);
......
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <float.h>
#include "common_mlu_helper.hpp"
#define COORD_NUM 4
__nram__ char nmem_buf[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, void *nram_src1,
void *nram_addition, const int32_t deal_num) {
__bang_active_reciphp((T *)nram_dst, (T *)nram_src1, deal_num);
__bang_mul((T *)nram_dst, (T *)nram_src0, (T *)nram_dst, deal_num);
}
template <>
__mlu_func__ void computeDiv<half>(void *nram_dst, void *nram_src0,
void *nram_src1, void *nram_addition,
const int32_t deal_num) {
__bang_half2float((float *)nram_addition, (half *)nram_src1, deal_num);
__bang_active_reciphp((float *)nram_addition, (float *)nram_addition,
deal_num);
__bang_float2half_rd((half *)nram_src1, (float *)nram_addition, deal_num);
__bang_mul((half *)nram_dst, (half *)nram_src0, (half *)nram_src1, deal_num);
}
template <typename T>
__mlu_func__ void bboxOverlapsWorkflow(
T *vec_b1_x1, T *vec_b1_y1, T *vec_b1_x2, T *vec_b1_y2, T *vec_b2_x1,
T *vec_b2_y1, T *vec_b2_x2, T *vec_b2_y2, T *vec_left, T *vec_right,
T *vec_top, T *vec_bottom, const T *bbox1, const T *bbox2, void *ious,
const int32_t offset, const int32_t mode, const int32_t batches_stride,
const int32_t num_bbox1, const int32_t num_bbox2, const bool aligned) {
int32_t task_batch_stride = (num_bbox1 + taskDim - 1) / taskDim;
int32_t batch_start = taskId * task_batch_stride;
int32_t batch_per_task = batch_start + task_batch_stride < num_bbox1
? task_batch_stride
: num_bbox1 - batch_start;
batch_per_task = batch_per_task > 0 ? batch_per_task : (0);
if (aligned) {
int32_t num_loop_cpy = batch_per_task / batches_stride;
int32_t num_rem_cpy_batches = batch_per_task % batches_stride;
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
for (int32_t i = 0; i < num_loop_cpy; i++) {
int32_t index = batch_start + i * batches_stride;
int32_t handle_batches = index + batches_stride > num_bbox1
? num_rem_cpy_batches
: batches_stride;
int32_t b1 = index;
int32_t b2 = index;
int32_t base1 = b1 * COORD_NUM;
__memcpy(vec_b1_x1, &bbox1[base1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_y1, &bbox1[base1 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_x2, &bbox1[base1 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_y2, &bbox1[base1 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
int32_t base2 = b2 * COORD_NUM;
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
// get the width and height
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__bang_write_value(vec_bottom, batches_stride, 0.f);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
T *width = vec_left;
// height --> vec_right
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
T *height = vec_right;
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
T *b1_area = vec_top;
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
T *b2_area = vec_b2_x1;
// inter_s = width * height
__bang_mul(height, width, height, batches_stride);
T *inter_s = height;
// offset vector ---> vec_b2_y1
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
__bang_add(b1_area, b1_area, b2_area, batches_stride);
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
} else {
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
}
T *base_s = b1_area;
// ious = inter_s / base_s
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
__memcpy((T *)ious + index, width, handle_batches * sizeof(T),
NRAM2GDRAM);
}
} else {
int32_t num_loop_cpy = num_bbox2 / batches_stride;
int32_t num_rem_cpy_batches = num_bbox2 % batches_stride;
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
for (int32_t i = 0; i < batch_per_task; i++) {
int32_t index1 = batch_start + i;
int32_t b1 = index1;
int32_t base1 = b1 * COORD_NUM;
// set bbox1 and bbox2 to nram
__bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]);
__bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
for (int32_t j = 0; j < num_loop_cpy; j++) {
int32_t index2 = j * batches_stride;
int32_t handle_batches = index2 + batches_stride > num_bbox2
? num_rem_cpy_batches
: batches_stride;
int32_t b2 = index2;
int32_t base2 = b2 * COORD_NUM;
// copy bbox2 to nram
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
// get the width and height
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__bang_write_value(vec_bottom, batches_stride, (T)0);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
T *width = vec_left;
// height --> vec_right
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
T *height = vec_right;
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
T *b1_area = vec_top;
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
T *b2_area = vec_b2_x1;
// inter_s = width * height
__bang_mul(height, width, height, batches_stride);
T *inter_s = height;
// offset vector ---> vec_b2_y1
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
__bang_add(b1_area, b1_area, b2_area, batches_stride);
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
} else {
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
}
T *base_s = b1_area;
// ious = inter_s / base_s
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
int32_t gdram_offset = index1 * num_bbox2 + index2;
__memcpy((T *)ious + gdram_offset, width, handle_batches * sizeof(T),
NRAM2GDRAM);
}
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelBBoxOverlaps(
const void *bbox1, const void *bbox2, void *ious, const int32_t num_bbox1,
const int32_t num_bbox2, const int32_t mode, const bool aligned,
const int32_t offset) {
/*
* NRAM partition
* |-------------------------------------------------------------|
* | vec_b1_x1 | vec_b1_y1 | vec_b1_x2 | vec_b1_y2 |
* |-------------------------------------------------------------|
* | vec_b2_x1 | vec_b2_y1 | vec_b2_x2 | vec_b2_y2 |
* |-------------------------------------------------------------|
* | vec_left | vec_right | vec_top | vec_bottom |
* |-------------------------------------------------------------|
*
*/
const int32_t align_bytes = PAD_DOWN(MAX_NRAM_SIZE, NFU_ALIGN_SIZE);
const int32_t split_nram_num = 12;
const int32_t nram_stride =
align_bytes / NFU_ALIGN_SIZE / split_nram_num * NFU_ALIGN_SIZE;
void *vec_b1_x1 = nmem_buf;
void *vec_b1_y1 = nmem_buf + nram_stride;
void *vec_b1_x2 = nmem_buf + 2 * nram_stride;
void *vec_b1_y2 = nmem_buf + 3 * nram_stride;
void *vec_b2_x1 = nmem_buf + 4 * nram_stride;
void *vec_b2_y1 = nmem_buf + 5 * nram_stride;
void *vec_b2_x2 = nmem_buf + 6 * nram_stride;
void *vec_b2_y2 = nmem_buf + 7 * nram_stride;
void *vec_left = nmem_buf + 8 * nram_stride;
void *vec_right = nmem_buf + 9 * nram_stride;
void *vec_top = nmem_buf + 10 * nram_stride;
void *vec_bottom = nmem_buf + 11 * nram_stride;
const int32_t vec_length = nram_stride / sizeof(T);
bboxOverlapsWorkflow((T *)vec_b1_x1, (T *)vec_b1_y1, (T *)vec_b1_x2,
(T *)vec_b1_y2, (T *)vec_b2_x1, (T *)vec_b2_y1,
(T *)vec_b2_x2, (T *)vec_b2_y2, (T *)vec_left,
(T *)vec_right, (T *)vec_top, (T *)vec_bottom,
(T *)bbox1, (T *)bbox2, (T *)ious, offset, mode,
vec_length, num_bbox1, num_bbox2, aligned);
}
void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *bbox1, const void *bbox2, void *ious,
const int32_t num_bbox1, const int32_t num_bbox2,
const int32_t mode, const bool aligned,
const int32_t offset) {
if (d_type == CNRT_FLOAT16) {
MLUUnion1KernelBBoxOverlaps<half><<<k_dim, k_type, queue>>>(
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
} else {
MLUUnion1KernelBBoxOverlaps<float><<<k_dim, k_type, queue>>>(
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
}
}
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef CARAFE_UTILS_HPP_
#define CARAFE_UTILS_HPP_
#define NRAM_ALIGN_SIZE 64
struct CarafeForwardParam {
int N; // batch size
int Hi; // input height
int Wi; // input width
int Ci; // input channels
int Ho; // output height
int Wo; // output width
int Cg; // channels per group
int kernel_size; // kernel_size
int group_size; // group_size
int scale_factor; // scale_factor
int kernel_size_half; // kernel half size (K-1)/2
int kernel_size_sq; // square of kernel size
int dtype_size; // size of tensor data type
// Host arrays' geometry
int input_stride_g;
int input_stride_w;
int input_stride_h;
int input_stride_n;
int input_size;
int mask_stride_kh;
int mask_stride_g;
int mask_stride_w;
int mask_stride_h;
int mask_stride_n;
int mask_size;
int output_stride_g;
int output_stride_w;
int output_stride_h;
int output_stride_n;
int output_size;
// NRAM arrays' geometry
int input_nram_stride_g;
int input_nram_stride_w;
int input_nram_stride_h;
int input_nram_size;
int mask_nram_stride_kh;
int mask_nram_stride_g;
int mask_nram_stride_w;
int mask_nram_stride_h;
int mask_nram_size;
int output_nram_stride_g;
int output_nram_stride_w;
int output_nram_stride_h;
int output_nram_size;
// for address/compute alignment
int align_size_NRAM; // for addressing on NRAM
int align_size_NFU; // for NFU operation length
int block_Cg_NFU; // for bang_mul_const
int job_num; // total job number
};
struct CarafeForwardBlockDim {
int Ho; // block size of output height
int Wo; // block size of output width
int Kh; // block size of kernel height
int Kw; // block size of kernel width
int G; // block size of groups
int Cg; // block size of channels within a group
int Hi; // block size of input height
int Wi; // block size of input width
};
struct CarafeForwardGridDim {
int Ho; // number of blocks of output height
int Wo;
int Kh;
int Kw;
int G;
int Cg;
};
#endif // CARAFE_UTILS_HPP_
@@ -45,148 +45,6 @@ __mlu_func__ inline scalar_t max(scalar_t a, scalar_t b) {
  return a > b ? a : b;
}
/*!
* @brief loads data from global DRAM to NRAM with 2D pattern.
*
* @param[out] dst
* Pointer to NRAM that stores dst data.
* @param[in] src
* Pointer to global DRAM that stores src data.
* @param[in] size
* The byte size of segment in the lower dimension.
* @param[in] dst_str
* The data stride in bytes between segments in the lower dimension of dst.
* @param[in] src_str
* The data stride in bytes between segments in the lower dimension of src.
* @param[in] seg_num
* The total count of data segments in the lower dimension.
*/
template <typename T>
__mlu_func__ void loadStr2D(T *dst, T *src, const int size, const int dst_str,
const int src_str, const int seg_num) {
if (dst_str == src_str && size == src_str) {
__memcpy(dst, src, src_str * seg_num * sizeof(T), GDRAM2NRAM);
} else if ((size == src_str || src_str <= dst_str) &&
src_str * sizeof(T) <= 512) {
// gather data less than 512Bytes to improve IO efficiency
T *tmp = (T *)dst + (dst_str - src_str) * seg_num;
__memcpy(tmp, src, (src_str * (seg_num - 1) + size) * sizeof(T),
GDRAM2NRAM);
if (dst_str != src_str) {
__memcpy(dst, tmp, size * sizeof(T), NRAM2NRAM, dst_str * sizeof(T),
src_str * sizeof(T), seg_num - 1);
}
} else {
__memcpy(dst, src, size * sizeof(T), GDRAM2NRAM, dst_str * sizeof(T),
src_str * sizeof(T), seg_num - 1);
}
}
/*!
* @brief loads data from global DRAM to NRAM with 3D pattern.
*
* @param[out] dst
* Pointer to NRAM that stores dst data.
* @param[in] src
* Pointer to global DRAM that stores src data.
* @param[in] size
* The byte size of segment in the lowest dimension.
* @param[in] seg_num_in
* The total count of data segments in the lowest dimension.
* @param[in] seg_num_out
* The total count of data segments in the middle dimension.
* @param[in] dst_str_in
* The data stride in bytes between segments in the lowest dimension of dst.
* @param[in] dst_str_out
* The data stride in bytes between segments in the middle dimension of dst.
* @param[in] src_str_in
* The data stride in bytes between segments in the lowest dimension of src.
* @param[in] src_str_out
* The data stride in bytes between segments in the middle dimension of src.
*/
template <typename T>
__mlu_func__ void loadStr3D(T *dst, T *src, const int size,
const int seg_num_in, const int seg_num_out,
const int dst_str_in, const int dst_str_out,
const int src_str_in, const int src_str_out) {
T *tmp_dst = dst;
T *tmp_src = src;
for (int i = 0; i < seg_num_out; ++i) {
loadStr2D(tmp_dst, tmp_src, size, dst_str_in, src_str_in, seg_num_in);
tmp_src += src_str_out;
tmp_dst += dst_str_out;
}
}
/*!
* @brief stores data from NRAM to global DRAM with 2D pattern.
*
* @param[out] dst
* Pointer to global DRAM that stores dst data.
* @param[in] src
* Pointer to NRAM that stores src data.
* @param[in] size
* The byte size of segment in the lower dimension.
* @param[in] dst_str
* The data stride in bytes between segments in the lower dimension of dst.
* @param[in] src_str
* The data stride in bytes between segments in the lower dimension of src.
* @param[in] seg_num
* The total count of data segments in the lower dimension.
*/
template <typename T>
__mlu_func__ void storeStr2D(T *dst, T *src, const int size, const int seg_num,
const int dst_str, const int src_str) {
if ((size == dst_str && dst_str <= src_str) && dst_str * sizeof(T) <= 512) {
// gather data less than 512Bytes to improve IO efficiency
if (dst_str != src_str) {
__memcpy(src, src, size * sizeof(T), NRAM2NRAM, dst_str * sizeof(T),
src_str * sizeof(T), seg_num - 1);
}
__memcpy(dst, src, size * seg_num * sizeof(T), NRAM2GDRAM);
} else {
__memcpy(dst, src, size * sizeof(T), NRAM2GDRAM, dst_str * sizeof(T),
src_str * sizeof(T), seg_num - 1);
}
}
/*!
* @brief stores data from NRAM to global DRAM with 3D pattern.
*
* @param[out] dst
* Pointer to global DRAM that stores dst data.
* @param[in] src
* Pointer to NRAM that stores src data.
* @param[in] size
* The byte size of segment in the lowest dimension.
* @param[in] seg_num_in
* The total count of data segments in the lowest dimension.
* @param[in] seg_num_out
* The total count of data segments in the middle dimension.
* @param[in] dst_str_in
* The data stride in bytes between segments in the lowest dimension of dst.
* @param[in] dst_str_out
* The data stride in bytes between segments in the middle dimension of dst.
* @param[in] src_str_in
* The data stride in bytes between segments in the lowest dimension of src.
* @param[in] src_str_out
* The data stride in bytes between segments in the middle dimension of src.
*/
template <typename T>
__mlu_func__ void storeStr3D(T *dst, T *src, const int size,
const int seg_num_in, const int seg_num_out,
const int dst_str_in, const int dst_str_out,
const int src_str_in, const int src_str_out) {
T *tmp_dst = dst;
T *tmp_src = src;
for (int i = 0; i < seg_num_out; ++i) {
storeStr2D(tmp_dst, tmp_src, size, seg_num_in, dst_str_in, src_str_in);
tmp_src += src_str_out;
tmp_dst += dst_str_out;
}
}
/*!
 * @brief Converts int32 to float32 data type.
 *
......