Unverified commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support only cpu compile
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
...@@ -7,46 +7,51 @@ on: [push, pull_request]
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.6, 3.7, 3.8]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v1
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install linting dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 isort yapf
    - name: Lint with flake8
      run: flake8 --max-complexity 20 .
    - name: Lint with isort
      run: isort -rc --check-only --diff mmcv/ tests/ examples/
    - name: Format python codes with yapf
      run: yapf -r -d mmcv/ tests/ examples/
    - name: Format c/cuda codes with clang-format
      uses: DoozyX/clang-format-lint-action@v0.6
      with:
        source: mmcv/ops/csrc
        extensions: h,c,cpp,hpp,cu,cuh
        style: google
    - name: Build and install
      run: rm -rf .eggs && pip install -e .
    - name: Install system dependencies
      run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
    - name: Install unittest dependencies
      run: |
        pip install pytest coverage lmdb PyTurboJPEG
        pip install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
    - name: Run unittests and generate coverage report
      run: |
        coverage run --branch --source=mmcv -m pytest tests/
        coverage xml
        coverage report -m
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@master
      with:
        file: ./coverage.xml
        flags: unittests
        env_vars: OS,PYTHON
        name: codecov-umbrella
        fail_ci_if_error: false
...@@ -28,3 +28,11 @@ repos:
        args: ["--remove"]
      - id: mixed-line-ending
        args: ["--fix=lf"]
  - repo: local
    hooks:
      - id: clang-format
        name: clang-format
        description: Format files with ClangFormat
        entry: clang-format -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
...@@ -24,12 +24,12 @@ and `MMAction <https://github.com/open-mmlab/mmaction>`_.
It provides the following functionalities.

- Universal IO APIs
- Image/Video processing
- Image and annotation visualization
- Useful utilities (progress bar, timer, ...)
- PyTorch runner with hooking mechanism
- Various CNN architectures
- High-quality implementation of common CUDA ops

See the `documentation <http://mmcv.readthedocs.io/en/latest>`_ for more features and usage.
......
...@@ -41,3 +41,8 @@ runner
------
.. automodule:: mmcv.runner
    :members:

ops
------
.. automodule:: mmcv.ops
    :members:
\ No newline at end of file
...@@ -90,3 +90,44 @@ conv1 = nn.Conv2d(3, 3, 1)
normal_init(conv1, std=0.01, bias=0)
xavier_init(conv1, distribution='uniform')
```
### Model Zoo
Besides the torchvision pre-trained models, we also provide pre-trained models for the following CNNs:
- VGG Caffe
- ResNet Caffe
- ResNeXt
- ResNet with Group Normalization
- ResNet with Group Normalization and Weight Standardization
- HRNetV2
- Res2Net
- RegNet
#### Model URLs in JSON
The model zoo links in MMCV are managed by JSON files.
Each json file consists of key-value pairs, where the key is a model name and the value is its url or path.
An example json file looks like this:
```json
{
"model_a": "https://example.com/models/model_a_9e5bac.pth",
"model_b": "pretrain/model_b_ab3ef2c.pth"
}
```
The default links of the pre-trained models hosted on the Open-MMLab AWS can be found [here](../mmcv/model_zoo/open_mmlab.json).
You may override the default links by putting `open-mmlab.json` under `MMCV_HOME`. If `MMCV_HOME` is not set in the environment, `~/.cache/mmcv` will be used by default. You may `export MMCV_HOME=/your/path` to use your own path.
The external json files will be merged into the default one. If the same key is present in both the external and the default json, the external one will be used.
#### Load Checkpoint
The following types are supported for the `filename` argument of `mmcv.load_checkpoint()`; a minimal usage sketch follows the list.
- filepath: The filepath of the checkpoint.
- `http://xxx` and `https://xxx`: The link to download the checkpoint. The `SHA256` postfix should be contained in the filename.
- `torchvision://xxx`: The model links in `torchvision.models`. Please refer to [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) for details.
- `open-mmlab://xxx`: The model links or filepaths provided in the default and additional json files.
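A minimal sketch of loading weights through this mechanism. The helper is imported from `mmcv.runner` here, and the `torchvision://resnet50` entry and network access are assumptions of this example, not part of the change above:
```python
import torchvision
from mmcv.runner import load_checkpoint

# Build an arbitrary model and fill it with pre-trained weights.
model = torchvision.models.resnet50()

# Any of the supported schemes can be passed as `filename`:
# a local path, an http(s) URL, `torchvision://...` or `open-mmlab://...`.
load_checkpoint(model, 'torchvision://resnet50', map_location='cpu')
```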
...@@ -48,7 +48,10 @@ extensions = [
    'recommonmark',
]

autodoc_mock_imports = [
    'cv2', 'mmcv._ext', 'mmcv._flow_warp_ext', 'mmcv.utils.ext_loader',
    'torchvision'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
......
...@@ -13,7 +13,7 @@ Contents
   utils.md
   runner.md
   cnn.md
   ops.md
   api.rst
......
## Model Zoo
Besides the torchvision pre-trained models, we also provide pre-trained models for the following CNNs:
* VGG Caffe
* ResNet Caffe
* ResNeXt
* ResNet with Group Normalization
* ResNet with Group Normalization and Weight Standardization
* HRNetV2
* Res2Net
* RegNet
### Model URLs in JSON
The model zoo links in MMCV are managed by JSON files.
Each json file consists of key-value pairs, where the key is a model name and the value is its url or path.
An example json file looks like this:
```json
{
"model_a": "https://example.com/models/model_a_9e5bac.pth",
"model_b": "pretrain/model_b_ab3ef2c.pth"
}
```
The default links of the pre-trained models hosted on the Open-MMLab AWS can be found [here](../mmcv/model_zoo/open_mmlab.json).
You may override the default links by putting `open-mmlab.json` under `MMCV_HOME`. If `MMCV_HOME` is not set in the environment, `~/.cache/mmcv` will be used by default. You may `export MMCV_HOME=/your/path` to use your own path.
The external json files will be merged into the default one. If the same key is present in both the external and the default json, the external one will be used.
### Load Checkpoint
The following types are supported for the `filename` argument of `mmcv.load_checkpoint()`.
* filepath: The filepath of the checkpoint.
* `http://xxx` and `https://xxx`: The link to download the checkpoint. The `SHA256` postfix should be contained in the filename.
* `torchvision://xxx`: The model links in `torchvision.models`. Please refer to [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) for details.
* `open-mmlab://xxx`: The model links or filepaths provided in the default and additional json files.
## CUDA ops
We implement common CUDA ops used in detection, segmentation, and other tasks; a short usage sketch follows the list below.
- BBoxOverlaps
- CARAFE
- CrissCrossAttention
- ContextBlock
- CornerPool
- Deformable Convolution v1/v2
- Deformable RoIPool
- GeneralizedAttention
- MaskedConv
- NMS
- PSAMask
- RoIPool
- RoIAlign
- SimpleRoIAlign
- SigmoidFocalLoss
- SoftmaxFocalLoss
- SoftNMS
- Synchronized BatchNorm
- Weight standardization
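A minimal sketch of calling two of the ops listed above from Python. A GPU build of the compiled extensions is assumed, and the return values follow the implementations ported from mmdetection:
```python
import torch
from mmcv.ops import bbox_overlaps, nms

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [20., 20., 30., 30.]]).cuda()
scores = torch.tensor([0.9, 0.8, 0.7]).cuda()

# Pairwise IoU matrix of shape (3, 3).
ious = bbox_overlaps(boxes, boxes)

# Non-maximum suppression at IoU threshold 0.5: `dets` holds the kept boxes
# with their scores appended, `keep` the indices of the kept boxes.
dets, keep = nms(boxes, scores, 0.5)
```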
...@@ -12,3 +12,4 @@ from .visualization import *
# without PyTorch.
# - runner
# - parallel
# - op
from .bbox import bbox_overlaps
from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive
from .cc_attention import CrissCrossAttention
from .context_block import ContextBlock
from .conv_ws import ConvWS2d, conv_ws_2d
from .corner_pool import CornerPool
from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d
from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack,
ModulatedDeformRoIPoolPack, deform_roi_pool)
from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss,
sigmoid_focal_loss, softmax_focal_loss)
from .generalized_attention import GeneralizedAttention
from .info import get_compiler_version, get_compiling_cuda_version
from .masked_conv import MaskedConv2d, masked_conv2d
from .modulated_deform_conv import (ModulatedDeformConv2d,
ModulatedDeformConv2dPack,
modulated_deform_conv2d)
from .nms import batched_nms, nms, nms_match, soft_nms
from .plugin import build_plugin_layer
from .point_sample import (SimpleRoIAlign, point_sample,
rel_roi_point_to_rel_img_point)
from .psa_mask import PSAMask
from .roi_align import RoIAlign, roi_align
from .roi_pool import RoIPool, roi_pool
from .sync_bn import SyncBatchNorm
from .wrappers import Conv2d, ConvTranspose2d, Linear, MaxPool2d
__all__ = [
'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe',
'carafe_naive', 'ContextBlock', 'ConvWS2d', 'conv_ws_2d', 'CornerPool',
'DeformConv2d', 'DeformConv2dPack', 'deform_conv2d', 'DeformRoIPool',
'DeformRoIPoolPack', 'ModulatedDeformRoIPoolPack', 'deform_roi_pool',
'SigmoidFocalLoss', 'SoftmaxFocalLoss', 'sigmoid_focal_loss',
'softmax_focal_loss', 'GeneralizedAttention', 'get_compiler_version',
'get_compiling_cuda_version', 'MaskedConv2d', 'masked_conv2d',
'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack',
'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match',
'build_plugin_layer', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool',
'SyncBatchNorm', 'Conv2d', 'ConvTranspose2d', 'Linear', 'MaxPool2d',
'CrissCrossAttention', 'PSAMask', 'point_sample',
'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign'
]
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])
def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
"""Calculate overlap between two set of bboxes.
If ``aligned`` is ``False``, then calculate the ious between each bbox
of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
bboxes1 and bboxes2.
Args:
bboxes1 (Tensor): shape (m, 4) in <x1, y1, x2, y2> format or empty.
bboxes2 (Tensor): shape (n, 4) in <x1, y1, x2, y2> format or empty.
If aligned is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union) or iof (intersection over
foreground).
Returns:
        ious (Tensor): shape (m, n) if ``aligned`` is False else shape (m, 1)
Example:
>>> bboxes1 = torch.FloatTensor([
>>> [0, 0, 10, 10],
>>> [10, 10, 20, 20],
>>> [32, 32, 38, 42],
>>> ])
>>> bboxes2 = torch.FloatTensor([
>>> [0, 0, 10, 20],
>>> [0, 10, 10, 19],
>>> [10, 10, 20, 20],
>>> ])
>>> bbox_overlaps(bboxes1, bboxes2)
tensor([[0.5000, 0.0000, 0.0000],
[0.0000, 0.0000, 1.0000],
[0.0000, 0.0000, 0.0000]])
Example:
>>> empty = torch.FloatTensor([])
>>> nonempty = torch.FloatTensor([
>>> [0, 0, 10, 9],
>>> ])
>>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
>>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
>>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
"""
mode_dict = {'iou': 0, 'iof': 1}
assert mode in mode_dict.keys()
mode_flag = mode_dict[mode]
    # Either the boxes are empty or the length of the boxes' last dimension is 4
assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
assert offset == 1 or offset == 0
rows = bboxes1.size(0)
cols = bboxes2.size(0)
if aligned:
assert rows == cols
if rows * cols == 0:
return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols)
if aligned:
ious = bboxes1.new_zeros(rows)
else:
ious = bboxes1.new_zeros((rows, cols))
ext_module.bbox_overlaps(
bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)
return ious
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
from torch.nn.modules.module import Module
from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward',
'carafe_backward'
])
class CARAFENaiveFunction(Function):
@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
return g.op(
'MMCVCARAFENaive',
features,
masks,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
assert masks.size(-2) == features.size(-2) * scale_factor
assert features.size(1) % group_size == 0
assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
ctx.kernel_size = kernel_size
ctx.group_size = group_size
ctx.scale_factor = scale_factor
ctx.feature_size = features.size()
ctx.mask_size = masks.size()
n, c, h, w = features.size()
output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
ext_module.carafe_naive_forward(
features,
masks,
output,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad:
ctx.save_for_backward(features, masks)
return output
@staticmethod
def backward(ctx, grad_output):
assert grad_output.is_cuda
features, masks = ctx.saved_tensors
kernel_size = ctx.kernel_size
group_size = ctx.group_size
scale_factor = ctx.scale_factor
grad_input = torch.zeros_like(features)
grad_masks = torch.zeros_like(masks)
ext_module.carafe_naive_backward(
grad_output.contiguous(),
features,
masks,
grad_input,
grad_masks,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
return grad_input, grad_masks, None, None, None
carafe_naive = CARAFENaiveFunction.apply
class CARAFENaive(Module):
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFENaive, self).__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
self.kernel_size = kernel_size
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features, masks):
return carafe_naive(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
class CARAFEFunction(Function):
@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
return g.op(
'MMCVCARAFE',
features,
masks,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
assert masks.size(-2) == features.size(-2) * scale_factor
assert features.size(1) % group_size == 0
assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
ctx.kernel_size = kernel_size
ctx.group_size = group_size
ctx.scale_factor = scale_factor
ctx.feature_size = features.size()
ctx.mask_size = masks.size()
n, c, h, w = features.size()
output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
routput = features.new_zeros(output.size(), requires_grad=False)
rfeatures = features.new_zeros(features.size(), requires_grad=False)
rmasks = masks.new_zeros(masks.size(), requires_grad=False)
ext_module.carafe_forward(
features,
masks,
rfeatures,
routput,
rmasks,
output,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad:
ctx.save_for_backward(features, masks, rfeatures)
return output
@staticmethod
def backward(ctx, grad_output):
assert grad_output.is_cuda
features, masks, rfeatures = ctx.saved_tensors
kernel_size = ctx.kernel_size
group_size = ctx.group_size
scale_factor = ctx.scale_factor
rgrad_output = torch.zeros_like(grad_output, requires_grad=False)
rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False)
rgrad_input = torch.zeros_like(features, requires_grad=False)
rgrad_masks = torch.zeros_like(masks, requires_grad=False)
grad_input = torch.zeros_like(features, requires_grad=False)
grad_masks = torch.zeros_like(masks, requires_grad=False)
ext_module.carafe_backward(
grad_output.contiguous(),
rfeatures,
masks,
rgrad_output,
rgrad_input_hs,
rgrad_input,
rgrad_masks,
grad_input,
grad_masks,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
return grad_input, grad_masks, None, None, None
carafe = CARAFEFunction.apply
class CARAFE(Module):
""" CARAFE: Content-Aware ReAssembly of FEatures
Please refer to https://arxiv.org/abs/1905.02188 for more details.
Args:
kernel_size (int): reassemble kernel size
group_size (int): reassemble group size
scale_factor (int): upsample ratio
Returns:
upsampled feature map
"""
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFE, self).__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
self.kernel_size = kernel_size
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features, masks):
return carafe(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
@UPSAMPLE_LAYERS.register_module(name='carafe')
class CARAFEPack(nn.Module):
"""A unified package of CARAFE upsampler that contains: 1) channel
compressor 2) content encoder 3) CARAFE op.
Official implementation of ICCV 2019 paper
CARAFE: Content-Aware ReAssembly of FEatures
Please refer to https://arxiv.org/abs/1905.02188 for more details.
Args:
channels (int): input feature channels
scale_factor (int): upsample ratio
up_kernel (int): kernel size of CARAFE op
up_group (int): group size of CARAFE op
encoder_kernel (int): kernel size of content encoder
encoder_dilation (int): dilation of content encoder
compressed_channels (int): output channels of channels compressor
Returns:
upsampled feature map
"""
def __init__(self,
channels,
scale_factor,
up_kernel=5,
up_group=1,
encoder_kernel=3,
encoder_dilation=1,
compressed_channels=64):
super(CARAFEPack, self).__init__()
self.channels = channels
self.scale_factor = scale_factor
self.up_kernel = up_kernel
self.up_group = up_group
self.encoder_kernel = encoder_kernel
self.encoder_dilation = encoder_dilation
self.compressed_channels = compressed_channels
self.channel_compressor = nn.Conv2d(channels, self.compressed_channels,
1)
self.content_encoder = nn.Conv2d(
self.compressed_channels,
self.up_kernel * self.up_kernel * self.up_group *
self.scale_factor * self.scale_factor,
self.encoder_kernel,
padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2),
dilation=self.encoder_dilation,
groups=1)
self.init_weights()
def init_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
xavier_init(m, distribution='uniform')
normal_init(self.content_encoder, std=0.001)
def kernel_normalizer(self, mask):
mask = F.pixel_shuffle(mask, self.scale_factor)
n, mask_c, h, w = mask.size()
mask_channel = int(mask_c / (self.up_kernel * self.up_kernel))
mask = mask.view(n, mask_channel, -1, h, w)
mask = F.softmax(mask, dim=2)
mask = mask.view(n, mask_c, h, w).contiguous()
return mask
def feature_reassemble(self, x, mask):
x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
return x
def forward(self, x):
compressed_x = self.channel_compressor(x)
mask = self.content_encoder(compressed_x)
mask = self.kernel_normalizer(mask)
x = self.feature_reassemble(x, mask)
return x
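A minimal usage sketch for the modules above. CUDA tensors are assumed, since the CARAFE kernels are implemented as GPU extensions, and the shapes follow the assertions in `CARAFEFunction.forward`:
```python
import torch
from mmcv.ops import CARAFE, CARAFEPack

x = torch.randn(2, 64, 16, 16).cuda()

# CARAFEPack predicts its own reassembly masks from the input features.
up = CARAFEPack(channels=64, scale_factor=2).cuda()
out = up(x)  # (2, 64, 32, 32)

# The bare CARAFE module expects externally provided masks with
# kernel_size^2 * group_size channels at the upsampled resolution.
masks = torch.randn(2, 25, 32, 32).cuda().softmax(dim=1)
out2 = CARAFE(kernel_size=5, group_size=1, scale_factor=2)(x, masks)
```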
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd.function import once_differentiable
from mmcv.cnn import Scale
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['ca_forward', 'ca_backward', 'ca_map_forward', 'ca_map_backward'])
class CAWeightFunction(torch.autograd.Function):
@staticmethod
def symbolic(g, t, f):
return g.op('MMCVCAWeight', t, f)
@staticmethod
def forward(ctx, t, f):
n, c, h, w = t.size()
weight = torch.zeros(n, h + w - 1, h, w).to(t.device)
ext_module.ca_forward(t, f, weight)
ctx.save_for_backward(t, f)
return weight
@staticmethod
@once_differentiable
def backward(ctx, dw):
t, f = ctx.saved_tensors
dt = torch.zeros_like(t)
df = torch.zeros_like(f)
ext_module.ca_backward(dw, t, f, dt, df)
return dt, df
class CAMapFunction(torch.autograd.Function):
@staticmethod
def symbolic(g, weight, v):
return g.op('MMCVCAMap', weight, v)
@staticmethod
def forward(ctx, weight, v):
out = torch.zeros_like(v)
ext_module.ca_map_forward(weight, v, out)
ctx.save_for_backward(weight, v)
return out
@staticmethod
@once_differentiable
def backward(ctx, dout):
weight, v = ctx.saved_tensors
dw = torch.zeros_like(weight)
dv = torch.zeros_like(v)
ext_module.ca_map_backward(dout, weight, v, dw, dv)
return dw, dv
ca_weight = CAWeightFunction.apply
ca_map = CAMapFunction.apply
class CrissCrossAttention(nn.Module):
"""Criss-Cross Attention Module."""
def __init__(self, in_channels):
super(CrissCrossAttention, self).__init__()
self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
self.value_conv = nn.Conv2d(in_channels, in_channels, 1)
self.gamma = Scale(0.)
self.in_channels = in_channels
def forward(self, x):
proj_query = self.query_conv(x)
proj_key = self.key_conv(x)
proj_value = self.value_conv(x)
energy = ca_weight(proj_query, proj_key)
attention = F.softmax(energy, 1)
out = ca_map(attention, proj_value)
out = self.gamma(out) + x
return out
def __repr__(self):
s = self.__class__.__name__
s += f'(in_channels={self.in_channels})'
return s
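A minimal usage sketch for the module above. CUDA tensors are assumed, since `ca_weight`/`ca_map` are GPU extension kernels:
```python
import torch
from mmcv.ops import CrissCrossAttention

cca = CrissCrossAttention(in_channels=64).cuda()
x = torch.randn(1, 64, 32, 32).cuda()
out = cca(x)  # same shape as the input: (1, 64, 32, 32)
```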
import torch
from torch import nn
from ..cnn import constant_init, kaiming_init
def last_zero_init(m):
if isinstance(m, nn.Sequential):
constant_init(m[-1], val=0)
else:
constant_init(m, val=0)
class ContextBlock(nn.Module):
"""ContextBlock module in GCNet.
See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'
(https://arxiv.org/abs/1904.11492) for details.
Args:
in_channels (int): Channels of the input feature map.
        ratio (float): Ratio of channels of transform bottleneck.
        pooling_type (str): Pooling method for context modeling.
            Options are 'att' and 'avg'.
        fusion_types (list[str]|tuple[str]): Fusion method for feature fusion.
            Options are 'channel_add' and 'channel_mul'.
"""
def __init__(self,
in_channels,
ratio,
pooling_type='att',
fusion_types=('channel_add', )):
super(ContextBlock, self).__init__()
assert pooling_type in ['avg', 'att']
assert isinstance(fusion_types, (list, tuple))
valid_fusion_types = ['channel_add', 'channel_mul']
assert all([f in valid_fusion_types for f in fusion_types])
assert len(fusion_types) > 0, 'at least one fusion should be used'
self.in_channels = in_channels
self.ratio = ratio
self.planes = int(in_channels * ratio)
self.pooling_type = pooling_type
self.fusion_types = fusion_types
if pooling_type == 'att':
self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1)
self.softmax = nn.Softmax(dim=2)
else:
self.avg_pool = nn.AdaptiveAvgPool2d(1)
if 'channel_add' in fusion_types:
self.channel_add_conv = nn.Sequential(
nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
nn.LayerNorm([self.planes, 1, 1]),
nn.ReLU(inplace=True), # yapf: disable
nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
else:
self.channel_add_conv = None
if 'channel_mul' in fusion_types:
self.channel_mul_conv = nn.Sequential(
nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
nn.LayerNorm([self.planes, 1, 1]),
nn.ReLU(inplace=True), # yapf: disable
nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
else:
self.channel_mul_conv = None
self.reset_parameters()
def reset_parameters(self):
if self.pooling_type == 'att':
kaiming_init(self.conv_mask, mode='fan_in')
self.conv_mask.inited = True
if self.channel_add_conv is not None:
last_zero_init(self.channel_add_conv)
if self.channel_mul_conv is not None:
last_zero_init(self.channel_mul_conv)
def spatial_pool(self, x):
batch, channel, height, width = x.size()
if self.pooling_type == 'att':
input_x = x
# [N, C, H * W]
input_x = input_x.view(batch, channel, height * width)
# [N, 1, C, H * W]
input_x = input_x.unsqueeze(1)
# [N, 1, H, W]
context_mask = self.conv_mask(x)
# [N, 1, H * W]
context_mask = context_mask.view(batch, 1, height * width)
# [N, 1, H * W]
context_mask = self.softmax(context_mask)
# [N, 1, H * W, 1]
context_mask = context_mask.unsqueeze(-1)
# [N, 1, C, 1]
context = torch.matmul(input_x, context_mask)
# [N, C, 1, 1]
context = context.view(batch, channel, 1, 1)
else:
# [N, C, 1, 1]
context = self.avg_pool(x)
return context
def forward(self, x):
# [N, C, 1, 1]
context = self.spatial_pool(x)
out = x
if self.channel_mul_conv is not None:
# [N, C, 1, 1]
channel_mul_term = torch.sigmoid(self.channel_mul_conv(context))
out = out * channel_mul_term
if self.channel_add_conv is not None:
# [N, C, 1, 1]
channel_add_term = self.channel_add_conv(context)
out = out + channel_add_term
return out
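A minimal usage sketch for the module above; `ratio` controls the width of the transform bottleneck relative to the input channels:
```python
import torch
from mmcv.ops import ContextBlock

gc_block = ContextBlock(in_channels=256, ratio=1. / 16)
x = torch.randn(2, 256, 20, 20)
out = gc_block(x)  # (2, 256, 20, 20), a residual-style refinement of x
```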
import torch.nn as nn
import torch.nn.functional as F
from ..cnn import CONV_LAYERS
def conv_ws_2d(input,
weight,
bias=None,
stride=1,
padding=0,
dilation=1,
groups=1,
eps=1e-5):
c_in = weight.size(0)
weight_flat = weight.view(c_in, -1)
mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1)
std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1)
weight = (weight - mean) / (std + eps)
return F.conv2d(input, weight, bias, stride, padding, dilation, groups)
@CONV_LAYERS.register_module('ConvWS')
class ConvWS2d(nn.Conv2d):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
eps=1e-5):
super(ConvWS2d, self).__init__(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias)
self.eps = eps
def forward(self, x):
return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding,
self.dilation, self.groups, self.eps)
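A minimal usage sketch for the weight-standardized convolution above, used as a drop-in replacement for `nn.Conv2d` (it is commonly paired with GroupNorm, though that is not required):
```python
import torch
from mmcv.ops import ConvWS2d

conv = ConvWS2d(3, 16, kernel_size=3, padding=1)
x = torch.randn(1, 3, 32, 32)
out = conv(x)  # (1, 16, 32, 32), computed with standardized weights
```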
from torch import nn
from torch.autograd import Function
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward',
'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward',
'right_pool_forward', 'right_pool_backward'
])
class TopPoolFunction(Function):
@staticmethod
def forward(ctx, input):
output = ext_module.top_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.top_pool_backward(input, grad_output)
return output
class BottomPoolFunction(Function):
@staticmethod
def forward(ctx, input):
output = ext_module.bottom_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.bottom_pool_backward(input, grad_output)
return output
class LeftPoolFunction(Function):
@staticmethod
def forward(ctx, input):
output = ext_module.left_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.left_pool_backward(input, grad_output)
return output
class RightPoolFunction(Function):
@staticmethod
def forward(ctx, input):
output = ext_module.right_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.right_pool_backward(input, grad_output)
return output
class CornerPool(nn.Module):
"""Corner Pooling.
Corner Pooling is a new type of pooling layer that helps a
convolutional network better localize corners of bounding boxes.
Please refer to https://arxiv.org/abs/1808.01244 for more details.
Code is modified from https://github.com/princeton-vl/CornerNet-Lite.
Args:
        mode (str): Pooling orientation for the pooling layer
- 'bottom': Bottom Pooling
- 'left': Left Pooling
- 'right': Right Pooling
- 'top': Top Pooling
Returns:
Feature map after pooling.
"""
pool_functions = {
'bottom': BottomPoolFunction,
'left': LeftPoolFunction,
'right': RightPoolFunction,
'top': TopPoolFunction,
}
def __init__(self, mode):
super(CornerPool, self).__init__()
assert mode in self.pool_functions
self.corner_pool = self.pool_functions[mode]
def forward(self, x):
return self.corner_pool.apply(x)
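A minimal usage sketch for the module above. CUDA tensors are assumed here, since the pooling forward/backward functions come from the compiled extension:
```python
import torch
from mmcv.ops import CornerPool

x = torch.randn(2, 128, 32, 32).cuda()

# Combine directions as in CornerNet-style corner heatmap branches.
top_left = CornerPool('top')(x) + CornerPool('left')(x)
bottom_right = CornerPool('bottom')(x) + CornerPool('right')(x)
```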
#ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH
#define BBOX_OVERLAPS_CUDA_KERNEL_CUH
template <typename T>
__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
T* ious, const int num_bbox1,
const int num_bbox2, const int mode,
const bool aligned,
const int offset) {
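  // aligned == true : compute IoU only between bbox1[i] and bbox2[i]
  //                   (the two sets must have the same length);
  // aligned == false: compute the full num_bbox1 x num_bbox2 IoU matrix.
  // mode 0 selects IoU, mode 1 selects IoF (intersection over foreground).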
if (aligned) {
CUDA_1D_KERNEL_LOOP(index, num_bbox1) {
int b1 = index;
int b2 = index;
int base1 = b1 * 4;
T b1_x1 = bbox1[base1];
T b1_y1 = bbox1[base1 + 1];
T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
int base2 = b2 * 4;
T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
T b2_x2 = bbox2[base2 + 2];
T b2_y2 = bbox2[base2 + 3];
T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
T width = fmaxf(right - left + offset, 0.f);
T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
ious[index] = interS / baseS;
}
} else {
CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
int b1 = index / num_bbox2;
int b2 = index % num_bbox2;
int base1 = b1 * 4;
T b1_x1 = bbox1[base1];
T b1_y1 = bbox1[base1 + 1];
T b1_x2 = bbox1[base1 + 2];
T b1_y2 = bbox1[base1 + 3];
T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
int base2 = b2 * 4;
T b2_x1 = bbox2[base2];
T b2_y1 = bbox2[base2 + 1];
T b2_x2 = bbox2[base2 + 2];
T b2_y2 = bbox2[base2 + 3];
T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
T width = fmaxf(right - left + offset, 0.f);
T height = fmaxf(bottom - top + offset, 0.f);
T interS = width * height;
T baseS = 1.0;
if (mode == 0) {
baseS = fmaxf(b1_area + b2_area - interS, T(offset));
} else if (mode == 1) {
baseS = fmaxf(b1_area, T(offset));
}
ious[index] = interS / baseS;
}
}
}
#endif
#define WARP_SIZE 32
#define THREADS_PER_PIXEL 32
#define MAX_SHARED_MEMORY 49152
#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144
#define MAXIMIZE_KERNEL_SIZE true
#define kTileDim 32
#define kBlockRows 8
#define FULL_MASK 0xffffffff
inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }
__device__ inline int Loc2Index(const int n, const int c, const int h,
const int w, const int channel_num,
const int height, const int width) {
int index = w + (h + (c + n * channel_num) * height) * width;
return index;
}
/* TODO: move this to a common place */
template <typename scalar_t>
__device__ inline scalar_t min(scalar_t a, scalar_t b) {
return a < b ? a : b;
}
template <typename scalar_t>
__device__ inline scalar_t max(scalar_t a, scalar_t b) {
return a > b ? a : b;
}
template <typename scalar_t>
__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(FULL_MASK, val, offset);
return val;
}
template <>
__device__ __forceinline__ phalf warpReduceSum(phalf val) {
for (int offset = 16; offset > 0; offset /= 2)
__PHALF(val) +=
__shfl_down_sync(FULL_MASK, static_cast<__half>(__PHALF(val)), offset);
return val;
}
// Splits the original matrix into submatrices with size 32 * 32.
// Each block transposes one submatrix by loading it into shared memory.
// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/
template <typename scalar_t>
__global__ void BatchTranspose2DCUDAKernel(const int N, const int H,
const int W, const int dh,
const int dw,
const scalar_t *__restrict__ X,
scalar_t *__restrict__ Y) {
__shared__ scalar_t tile[kTileDim][kTileDim + 1];
const int n = blockIdx.x / (dh * dw);
const int k = blockIdx.x % (dh * dw);
const int r = k / dw;
const int c = k % dw;
const int offset = n * H * W;
int x = c * kTileDim + threadIdx.x;
int y = r * kTileDim + threadIdx.y;
if (x < W) {
for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) {
tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x];
}
}
__syncthreads();
x = r * kTileDim + threadIdx.x;
y = c * kTileDim + threadIdx.y;
if (x < H) {
for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) {
Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i];
}
}
}
template <typename scalar_t>
__global__ void CARAFEForward(
const int num_kernels, const scalar_t *__restrict__ bottom_data,
const scalar_t *__restrict__ bottom_masks, const int kernel_size,
const int group_size, const int scale_factor, const int channels,
const int down_height, const int down_width, const int height,
const int width, const int mask_channels, scalar_t *__restrict__ top_data) {
#if MAXIMIZE_KERNEL_SIZE
__shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
#else
__shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
#endif
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index > num_kernels - 1) {
return;
}
const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
const int split_id = threadIdx.x % THREADS_PER_PIXEL;
index = index / THREADS_PER_PIXEL;
const int pw = index % width;
const int ph = (index / width) % height;
const int n = index / width / height;
const int down_pw = pw / scale_factor;
const int down_ph = ph / scale_factor;
const int start_w = down_pw - (kernel_size - 1) / 2;
const int end_w = down_pw + (kernel_size - 1) / 2 + 1;
const int start_h = down_ph - (kernel_size - 1) / 2;
const int end_h = down_ph + (kernel_size - 1) / 2 + 1;
for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels);
shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
}
__syncthreads();
const int channels_per_group = ceilf(channels / (float)group_size);
#pragma unroll
for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
int mask_group = c / channels_per_group;
scalar_t output_val = 0;
#pragma unroll
for (int iy = start_h; iy < end_h; iy++) {
#pragma unroll
for (int ix = start_w; ix < end_w; ix++) {
if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
continue;
}
int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
int mask_c =
(mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
int feat_index =
Loc2Index(n, iy, ix, c, down_height, down_width, channels);
output_val += bottom_data[feat_index] *
shared_mask[mask_c * WARP_SIZE + pixel_id];
}
}
int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
top_data[top_index] = output_val;
}
}
template <typename scalar_t>
__global__ void CARAFEBackward_Feature(
const int num_kernels, const scalar_t *__restrict__ top_diff,
const scalar_t *__restrict__ bottom_masks, const int kernel_size,
const int group_size, const int scale_factor, const int channels,
const int down_height, const int down_width, const int height,
const int width, const int mask_channels,
scalar_t *__restrict__ bottom_diff) {
#if MAXIMIZE_KERNEL_SIZE
__shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
#else
__shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
#endif
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index > num_kernels - 1) {
return;
}
const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
const int split_id = threadIdx.x % THREADS_PER_PIXEL;
// (n, c, ph, pw) is an element in the bottom_data
index = index / THREADS_PER_PIXEL;
const int pw = index % width;
const int ph = (index / width) % height;
const int n = index / width / height;
const int start_w = pw - (kernel_size - 1) * scale_factor / 2;
const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1;
const int start_h = ph - (kernel_size - 1) * scale_factor / 2;
const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1;
for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
const int mask_w = (c % kernel_size) * scale_factor;
const int mask_h = (c / kernel_size % kernel_size) * scale_factor;
const int mask_x = start_w + mask_w;
const int mask_y = start_h + mask_h;
if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) {
shared_mask[c * WARP_SIZE + pixel_id] = 0;
continue;
}
const int mask_group = c / (kernel_size * kernel_size);
const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1;
int mask_index =
Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width);
shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
}
__syncthreads();
const int channels_per_group = ceilf(channels / (float)group_size);
#pragma unroll
for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
int mask_group = c / channels_per_group;
int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
scalar_t output_val = 0;
#pragma unroll
for (int iy = start_h; iy < end_h; iy += scale_factor) {
#pragma unroll
for (int ix = start_w; ix < end_w; ix += scale_factor) {
if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) {
continue;
}
int mask_iy =
(iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor;
int mask_ix =
(ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor;
int mask_c =
(mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
int feat_index = Loc2Index(n, iy, ix, c, height, width, channels);
output_val +=
shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index];
}
}
bottom_diff[top_index] = output_val;
}
}
template <typename scalar_t>
__global__ void FeatureSum(const int num_kernels,
const scalar_t *__restrict__ input_data,
const int scale_factor, const int channels,
const int height, const int width,
scalar_t *__restrict__ output_data) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index > num_kernels - 1) {
return;
}
const int split_id = threadIdx.x % THREADS_PER_PIXEL;
index = index / THREADS_PER_PIXEL;
const int pw = index % width;
const int ph = (index / width) % height;
const int n = index / width / height;
for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
scalar_t output_val = 0;
for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) {
for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) {
int input_id = Loc2Index(n, iy, ix, c, height * scale_factor,
width * scale_factor, channels);
output_val += input_data[input_id];
}
}
const int output_id = Loc2Index(n, ph, pw, c, height, width, channels);
output_data[output_id] = output_val;
}
}
template <typename scalar_t>
__global__ void CARAFEBackward_Mask(const int num_kernels,
const scalar_t *__restrict__ top_diff,
const scalar_t *__restrict__ bottom_data,
const int kernel_size, const int group_size,
const int scale_factor, const int channels,
const int down_height, const int down_width,
const int height, const int width,
const int mask_channels,
scalar_t *__restrict__ mask_diff) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index > num_kernels - 1) {
return;
}
const int lane_id = index % WARP_SIZE;
index = index / WARP_SIZE;
const int mask_c = index % mask_channels;
// (n, c, ph, pw) is an element in the bottom_data
index = index / mask_channels;
const int pw = index % width;
const int ph = (index / width) % height;
const int n = index / width / height;
const int down_pw = pw / scale_factor;
const int down_ph = ph / scale_factor;
const int mask_group = mask_c / (kernel_size * kernel_size);
const int mask_loc = mask_c % (kernel_size * kernel_size);
const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2;
const int offset_y =
mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2;
const int down_x = down_pw + offset_x;
const int down_y = down_ph + offset_y;
scalar_t output_val = 0;
if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 &&
down_x <= down_width - 1) {
const int channels_per_mask = ceilf(channels / (float)group_size);
const int start = channels_per_mask * mask_group;
const int end = min(channels_per_mask * (mask_group + 1), channels);
for (int c = start + lane_id; c < end; c += WARP_SIZE) {
int bottom_id =
Loc2Index(n, down_y, down_x, c, down_height, down_width, channels);
int top_id = Loc2Index(n, ph, pw, c, height, width, channels);
output_val += top_diff[top_id] * bottom_data[bottom_id];
}
}
__syncwarp();
output_val = warpReduceSum(output_val);
if (lane_id == 0) {
const int mask_id =
Loc2Index(n, ph, pw, mask_c, height, width, mask_channels);
mask_diff[mask_id] = output_val;
}
}
#ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH
#define CARAFE_NAIVE_CUDA_KERNEL_CUH
__device__ inline int Loc2Index(const int n, const int c, const int h,
const int w, const int channel_num,
const int height, const int width) {
int index = w + (h + (c + n * channel_num) * height) * width;
return index;
}
template <typename scalar_t>
__global__ void carafe_naive_forward_cuda_kernel(
const int nthreads, const scalar_t *bottom_data,
const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size,
const int group_size, const int scale_factor, const int channels,
const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the bottom_data
int pw = index % width;
int ph = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
int mask_channels = kernel_size * kernel_size * group_size;
int mask_group = c / (channels / group_size);
int down_pw = pw / scale_factor;
int down_ph = ph / scale_factor;
int down_width = width / scale_factor;
int down_height = height / scale_factor;
int start_w = down_pw - (kernel_size - 1) / 2;
int end_w = down_pw + (kernel_size - 1) / 2 + 1;
int start_h = down_ph - (kernel_size - 1) / 2;
int end_h = down_ph + (kernel_size - 1) / 2 + 1;
scalar_t output_val = 0;
for (int iy = start_h; iy < end_h; iy++) {
for (int ix = start_w; ix < end_w; ix++) {
if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
continue;
}
int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
int mask_c =
(mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
int feat_index =
Loc2Index(n, c, iy, ix, channels, down_height, down_width);
int mask_index =
Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
output_val += bottom_data[feat_index] * bottom_masks[mask_index];
}
}
top_data[index] = output_val;
}
}
template <typename scalar_t>
__global__ void carafe_naive_backward_cuda_kernel(
const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data,
const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff,
const int kernel_size, const int group_size, const int scale_factor,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the bottom_data
int pw = index % width;
int ph = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
int mask_channels = kernel_size * kernel_size * group_size;
int mask_group = c / (channels / group_size);
int down_pw = pw / scale_factor;
int down_ph = ph / scale_factor;
int down_width = width / scale_factor;
int down_height = height / scale_factor;
int start_w = down_pw - (kernel_size - 1) / 2;
int end_w = down_pw + (kernel_size - 1) / 2 + 1;
int start_h = down_ph - (kernel_size - 1) / 2;
int end_h = down_ph + (kernel_size - 1) / 2 + 1;
for (int iy = start_h; iy < end_h; iy++) {
for (int ix = start_w; ix < end_w; ix++) {
if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
continue;
}
int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
int mask_c =
(mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
int feat_index =
Loc2Index(n, c, iy, ix, channels, down_height, down_width);
int mask_index =
Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
atomicAdd(bottom_diff + feat_index,
bottom_masks[mask_index] * top_diff[index]);
atomicAdd(mask_diff + mask_index,
bottom_data[feat_index] * top_diff[index]);
}
}
}
}
#endif