Unverified commit 0b4285d9 authored by Zaida Zhou, committed by GitHub

Pick commits from master (#2164)



* [Docs] Add switch_language.md in docs (#2160)

* [Fix] Fix onnx unit tests (#2155)

* [Docs] Limit extension versions (#2144)

* Support PrRoIPool operation

* Add MPS bbox overlap

* Add .pre-commit-config-zh-cn.yaml (#2135)
Co-authored-by: xcnick <xcnick0412@gmail.com>
Co-authored-by: Jingwei Zhang <zjw18@mails.tsinghua.edu.cn>
Co-authored-by: q.yao <yaoqian@sensetime.com>
parent 47a61c3b
exclude: ^tests/data/
repos:
- repo: https://gitee.com/openmmlab/mirrors-flake8
rev: 3.8.3
hooks:
- id: flake8
- repo: https://gitee.com/openmmlab/mirrors-isort
rev: 5.10.1
hooks:
- id: isort
- repo: https://gitee.com/openmmlab/mirrors-yapf
rev: v0.30.0
hooks:
- id: yapf
- repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
rev: v3.1.0
hooks:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
- id: fix-encoding-pragma
args: ["--remove"]
- id: mixed-line-ending
args: ["--fix=lf"]
- repo: https://gitee.com/openmmlab/mirrors-codespell
rev: v2.1.0
hooks:
- id: codespell
- repo: https://gitee.com/openmmlab/mirrors-mdformat
rev: 0.7.9
hooks:
- id: mdformat
args: ["--number"]
additional_dependencies:
- mdformat-openmmlab
- mdformat_frontmatter
- linkify-it-py
- repo: https://gitee.com/openmmlab/mirrors-docformatter
rev: v1.3.1
hooks:
- id: docformatter
args: ["--in-place", "--wrap-descriptions", "79"]
- repo: https://github.com/asottile/pyupgrade
rev: v2.32.1
hooks:
- id: pyupgrade
args: ["--py36-plus"]
- repo: https://gitee.com/openmmlab/pre-commit-hooks
rev: v0.2.0 # Use the ref you want to point at
hooks:
- id: check-copyright
args: ["mmcv", "tests", "--excludes", "mmcv/ops"]
- repo: https://gitee.com/openmmlab/mirrors-mypy
rev: v0.812
hooks:
- id: mypy
exclude: |-
(?x)(
^test
| ^docs
)
# - repo: local
# hooks:
# - id: clang-format
# name: clang-format
# description: Format files with ClangFormat
# entry: clang-format -style=google -i
# language: system
# files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
@@ -3,4 +3,5 @@ include mmcv/model_zoo/open_mmlab.json mmcv/model_zoo/deprecated.json mmcv/model
include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp
include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp
include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp
include mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm
recursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm
@@ -36,6 +36,11 @@ You can switch between Chinese and English documents in the lower-left corner of
   deployment/tensorrt_custom_ops.md
   deployment/tensorrt_plugin.md

.. toctree::
   :caption: Switch Language

   switch_language.md

.. toctree::
   :maxdepth: 2
   :caption: Compatibility
......
## <a href='https://mmcv.readthedocs.io/en/latest/'>English</a>
## <a href='https://mmcv.readthedocs.io/zh_CN/latest/'>简体中文</a>
@@ -2,58 +2,59 @@
We implement common ops used in detection, segmentation, etc.

| Device                       | CPU | CUDA | MLU | MPS |
| ---------------------------- | --- | ---- | --- | --- |
| ActiveRotatedFilter          | √   | √    |     |     |
| AssignScoreWithK             |     | √    |     |     |
| BallQuery                    |     | √    |     |     |
| BBoxOverlaps                 |     | √    | √   | √   |
| BorderAlign                  |     | √    |     |     |
| BoxIouRotated                | √   | √    |     |     |
| CARAFE                       |     | √    |     |     |
| ChamferDistance              |     | √    |     |     |
| CrissCrossAttention          |     | √    |     |     |
| ContourExpand                | √   |      |     |     |
| ConvexIoU                    |     | √    |     |     |
| CornerPool                   |     | √    |     |     |
| Correlation                  |     | √    |     |     |
| Deformable Convolution v1/v2 | √   | √    |     |     |
| Deformable RoIPool           |     | √    |     |     |
| DiffIoURotated               |     | √    |     |     |
| DynamicScatter               |     | √    |     |     |
| FurthestPointSample          |     | √    |     |     |
| FurthestPointSampleWithDist  |     | √    |     |     |
| FusedBiasLeakyrelu           |     | √    |     |     |
| GatherPoints                 |     | √    |     |     |
| GroupPoints                  |     | √    |     |     |
| Iou3d                        |     | √    |     |     |
| KNN                          |     | √    |     |     |
| MaskedConv                   |     | √    |     |     |
| MergeCells                   |     | √    |     |     |
| MinAreaPolygon               |     | √    |     |     |
| ModulatedDeformConv2d        | √   | √    |     |     |
| MultiScaleDeformableAttn     |     | √    |     |     |
| NMS                          | √   | √    | √   |     |
| NMSRotated                   | √   | √    |     |     |
| PixelGroup                   | √   |      |     |     |
| PointsInBoxes                | √   | √    |     |     |
| PointsInPolygons             |     | √    |     |     |
| PSAMask                      | √   | √    | √   |     |
| RotatedFeatureAlign          | √   | √    |     |     |
| RoIPointPool3d               |     | √    |     |     |
| RoIPool                      |     | √    | √   |     |
| RoIAlignRotated              | √   | √    | √   |     |
| RiRoIAlignRotated            |     | √    |     |     |
| RoIAlign                     | √   | √    | √   |     |
| RoIAwarePool3d               |     | √    |     |     |
| SAConv2d                     |     | √    |     |     |
| SigmoidFocalLoss             |     | √    | √   |     |
| SoftmaxFocalLoss             |     | √    |     |     |
| SoftNMS                      |     | √    |     |     |
| Sparse Convolution           |     | √    |     |     |
| Synchronized BatchNorm       |     | √    |     |     |
| ThreeInterpolate             |     | √    |     |     |
| ThreeNN                      |     | √    |     |     |
| TINShift                     |     | √    | √   |     |
| UpFirDn2d                    |     | √    |     |     |
| Voxelization                 | √   | √    |     |     |
| PrRoIPool                    |     | √    |     |     |
@@ -36,6 +36,11 @@
   deployment/tensorrt_plugin.md
   deployment/tensorrt_custom_ops.md

.. toctree::
   :caption: Switch Language

   switch_language.md

.. toctree::
   :maxdepth: 2
   :caption: Compatibility
......
## <a href='https://mmcv.readthedocs.io/en/latest/'>English</a>
## <a href='https://mmcv.readthedocs.io/zh_CN/latest/'>简体中文</a>
@@ -2,58 +2,59 @@
MMCV provides common ops used in detection, segmentation, and other tasks.

| Device                       | CPU | CUDA | MLU | MPS |
| ---------------------------- | --- | ---- | --- | --- |
| ActiveRotatedFilter          | √   | √    |     |     |
| AssignScoreWithK             |     | √    |     |     |
| BallQuery                    |     | √    |     |     |
| BBoxOverlaps                 |     | √    | √   | √   |
| BorderAlign                  |     | √    |     |     |
| BoxIouRotated                | √   | √    |     |     |
| CARAFE                       |     | √    |     |     |
| ChamferDistance              |     | √    |     |     |
| CrissCrossAttention          |     | √    |     |     |
| ContourExpand                | √   |      |     |     |
| ConvexIoU                    |     | √    |     |     |
| CornerPool                   |     | √    |     |     |
| Correlation                  |     | √    |     |     |
| Deformable Convolution v1/v2 | √   | √    |     |     |
| Deformable RoIPool           |     | √    |     |     |
| DiffIoURotated               |     | √    |     |     |
| DynamicScatter               |     | √    |     |     |
| FurthestPointSample          |     | √    |     |     |
| FurthestPointSampleWithDist  |     | √    |     |     |
| FusedBiasLeakyrelu           |     | √    |     |     |
| GatherPoints                 |     | √    |     |     |
| GroupPoints                  |     | √    |     |     |
| Iou3d                        |     | √    |     |     |
| KNN                          |     | √    |     |     |
| MaskedConv                   |     | √    |     |     |
| MergeCells                   |     | √    |     |     |
| MinAreaPolygon               |     | √    |     |     |
| ModulatedDeformConv2d        | √   | √    |     |     |
| MultiScaleDeformableAttn     |     | √    |     |     |
| NMS                          | √   | √    | √   |     |
| NMSRotated                   | √   | √    |     |     |
| PixelGroup                   | √   |      |     |     |
| PointsInBoxes                | √   | √    |     |     |
| PointsInPolygons             |     | √    |     |     |
| PSAMask                      | √   | √    | √   |     |
| RotatedFeatureAlign          | √   | √    |     |     |
| RoIPointPool3d               |     | √    |     |     |
| RoIPool                      |     | √    | √   |     |
| RoIAlignRotated              | √   | √    | √   |     |
| RiRoIAlignRotated            |     | √    |     |     |
| RoIAlign                     | √   | √    | √   |     |
| RoIAwarePool3d               |     | √    |     |     |
| SAConv2d                     |     | √    |     |     |
| SigmoidFocalLoss             |     | √    | √   |     |
| SoftmaxFocalLoss             |     | √    |     |     |
| SoftNMS                      |     | √    |     |     |
| Sparse Convolution           |     | √    |     |     |
| Synchronized BatchNorm       |     | √    |     |     |
| ThreeInterpolate             |     | √    |     |     |
| ThreeNN                      |     | √    |     |     |
| TINShift                     |     | √    | √   |     |
| UpFirDn2d                    |     | √    |     |     |
| Voxelization                 | √   | √    |     |     |
| PrRoIPool                    |     | √    |     |     |
@@ -46,6 +46,7 @@ from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
                              points_in_boxes_part)
from .points_in_polygons import points_in_polygons
from .points_sampler import PointsSampler
from .prroi_pool import PrRoIPool, prroi_pool
from .psa_mask import PSAMask
from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated
from .roi_align import RoIAlign, roi_align
@@ -100,5 +101,6 @@ __all__ = [
    'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part',
    'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons',
    'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou',
    'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance',
    'PrRoIPool', 'prroi_pool'
]
@@ -13,11 +13,19 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│   ├── pytorch_cpp_helper.hpp
│   ├── pytorch_cuda_helper.hpp
│   ├── pytorch_device_registry.hpp
│   ├── cuda
│   │   ├── common_cuda_helper.hpp
│   │   ├── parrots_cudawarpfunction.cuh
│   │   ├── ...
│   │   └── ops_cuda_kernel.cuh
│   ├── mps
│   │   ├── MPSLibrary.h
│   │   ├── ...
│   │   └── MPSUtils.h
│   ├── mlu
│   │   └── ...
│   └── utils
│       └── ...
├── onnxruntime
│   ├── onnxruntime_register.h
│   ├── onnxruntime_session_options_config_keys.h
@@ -41,9 +49,15 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│   ├── cuda
│   │   ├── ...
│   │   └── ops_cuda.cu
│   ├── cpu
│   │   ├── ...
│   │   └── ops.cpp
│   ├── mps
│   │   ├── ...
│   │   └── op_mps.mm
│   └── mlu
│       ├── ...
│       └── op_mlu.cpp
└── tensorrt
    ├── trt_cuda_helper.cuh
    ├── trt_plugin_helper.hpp
@@ -63,13 +77,18 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
- `common`: This directory contains all tools and shared code.
  - `cuda`: The CUDA kernels that can be shared by all backends. **HIP** kernels are also placed here since they have similar syntax.
  - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.
  - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) devices.
  - `utils`: The kernels and utils of spconv.
- `onnxruntime`: **ONNX Runtime** support for custom ops. It has been deprecated; please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
  - `cpu`: CPU implementation of supported ops.
- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The op implementations and binding code are placed in this directory.
  - `cuda`: This directory contains CUDA kernel launchers, which feed the memory pointers of tensors to the CUDA kernels in `common/cuda`. The launchers provide a C++ interface to the CUDA implementation of the corresponding custom ops.
  - `cpu`: This directory contains CPU implementations of the corresponding custom ops.
  - `mlu`: This directory contains the launchers of the MLU kernels.
  - `mps`: MPS op implementations and launchers.
- `tensorrt`: **TensorRT** support for custom ops. It has been deprecated; please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
  - `plugins`: This directory contains the implementations of the supported custom ops. Some ops might also use the shared CUDA kernels in `common/cuda`.
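The dispatch between these backends goes through `pytorch_device_registry.hpp`: each op defines a device-agnostic `*_impl` function that forwards at runtime to whichever kernel launcher was registered for the tensors' device. A minimal sketch of the pattern, abridged from the `prroi_pool` files added in this commit:

```c++
// Device-agnostic entry point (mmcv/ops/csrc/pytorch/prroi_pool.cpp):
// DISPATCH_DEVICE_IMPL looks up the implementation registered for the
// device of the input tensors at runtime.
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                             int pooled_height, int pooled_width,
                             float spatial_scale) {
  DISPATCH_DEVICE_IMPL(prroi_pool_forward_impl, input, rois, output,
                       pooled_height, pooled_width, spatial_scale);
}

// CUDA backend registration (mmcv/ops/csrc/pytorch/cuda/cudabind.cpp):
REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);
```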
## How to add new PyTorch ops?
......
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu
// Distributed under terms of the MIT license.
#ifndef PRROI_POOL_CUDA_KERNEL_CUH
#define PRROI_POOL_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
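// PrRoI (Precise RoI) Pooling computes each output bin as the exact integral
// of the bilinearly interpolated feature map over the bin, divided by the bin
// area, which makes the result continuously differentiable w.r.t. the RoI
// coordinates (see https://github.com/vacancy/PreciseRoIPooling).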
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data,
const int h,
const int w,
const int height,
const int width) {
bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
T retVal = overflow ? 0.0f : data[h * width + w];
return retVal;
}
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) {
return (1.0f - abs(dh)) * (1.0f - abs(dw));
}
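// Closed-form integral of the linear function f(x) = c1 + (c2 - c1) * x over
// [s, t]: 0.5 * (t^2 - s^2) * (c2 - c1) + (t - s) * c1.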
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t,
T c1, T c2) {
return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1;
}
template <typename T>
__device__ static T PrRoIPoolingInterpolation(const T *data, const T h,
const T w, const int height,
const int width) {
T retVal = 0.0f;
int h1 = floorf(h);
int w1 = floorf(w);
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h) + 1;
w1 = floorf(w);
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h);
w1 = floorf(w) + 1;
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h) + 1;
w1 = floorf(w) + 1;
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
return retVal;
}
template <typename T>
__device__ static T PrRoIPoolingMatCalculation(const T *this_data,
const int s_h, const int s_w,
const int e_h, const int e_w,
const T y0, const T x0,
const T y1, const T x1,
const int h0, const int w0) {
T alpha, beta, lim_alpha, lim_beta, tmp;
T sum_out = 0;
alpha = x0 - T(s_w);
beta = y0 - T(s_h);
lim_alpha = x1 - T(s_w);
lim_beta = y1 - T(s_h);
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp;
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp;
alpha = x0 - T(s_w);
beta = T(e_h) - y1;
lim_alpha = x1 - T(s_w);
lim_beta = T(e_h) - y0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp;
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp;
return sum_out;
}
template <typename T>
__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff,
const int h, const int w,
const int height,
const int width,
const T coeff) {
bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff);
}
template <typename T>
__device__ static void PrRoIPoolingMatDistributeDiff(
T *diff, const T top_diff, const int s_h, const int s_w, const int e_h,
const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0,
const int w0) {
T alpha, beta, lim_alpha, lim_beta, tmp;
alpha = x0 - T(s_w);
beta = y0 - T(s_h);
lim_alpha = x1 - T(s_w);
lim_beta = y1 - T(s_h);
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp);
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp);
alpha = x0 - T(s_w);
beta = T(e_h) - y1;
lim_alpha = x1 - T(s_w);
lim_beta = T(e_h) - y0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp);
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp);
}
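// Forward: each thread computes one output element (n, c, ph, pw) by summing
// the closed-form integral of the bilinear interpolation over every unit cell
// covered by the bin, then dividing by the bin area.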
template <typename T>
__global__ void prroi_pool_forward_cuda_kernel(
const int nthreads, const T *input, const T *rois, T *output,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T *offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
T roi_x1 = offset_rois[1] * spatial_scale;
T roi_y1 = offset_rois[2] * spatial_scale;
T roi_x2 = offset_rois[3] * spatial_scale;
T roi_y2 = offset_rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, ((T)0.0));
T roi_height = max(roi_y2 - roi_y1, ((T)0.0));
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T *this_data =
input + (roi_batch_ind * channels + c) * height * width;
T *this_out = output + index;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
if (bin_size == 0) {
*this_out = 0;
continue;
}
T sum_out = 0;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
for (int bin_x = start_x; bin_x < end_x; ++bin_x)
for (int bin_y = start_y; bin_y < end_y; ++bin_y)
sum_out += PrRoIPoolingMatCalculation(
this_data, bin_y, bin_x, bin_y + 1, bin_x + 1,
max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,
width);
*this_out = sum_out / bin_size;
}
}
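// Backward w.r.t. the input feature map: the output gradient, normalized by
// the bin area, is spread back over the same integration region with matching
// coefficients via atomicAdd.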
template <typename T>
__global__ void prroi_pool_backward_cuda_kernel(
const int nthreads, const T *grad_output, const T *rois, T *grad_input,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
rois += n * 5;
int roi_batch_ind = rois[0];
T roi_x1 = rois[1] * spatial_scale;
T roi_y1 = rois[2] * spatial_scale;
T roi_x2 = rois[3] * spatial_scale;
T roi_y2 = rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, (T)0);
T roi_height = max(roi_y2 - roi_y1, (T)0);
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T *this_out_grad = grad_output + index;
T *this_data_grad =
grad_input + (roi_batch_ind * channels + c) * height * width;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
T sum_out = bin_size == T(0) ? T(0) : *this_out_grad / bin_size;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
for (int bin_x = start_x; bin_x < end_x; ++bin_x)
for (int bin_y = start_y; bin_y < end_y; ++bin_y)
PrRoIPoolingMatDistributeDiff(
this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1,
max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,
width);
}
}
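// Backward w.r.t. the RoI coordinates: the bin integral is differentiated
// with respect to its bounds (Leibniz rule), using line integrals of the
// interpolated feature along the four bin edges.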
template <typename T>
__global__ void prroi_pool_coor_backward_cuda_kernel(
const int nthreads, const T *output, const T *grad_output, const T *input,
const T *rois, T *grad_rois, const int pooled_height,
const int pooled_width, const T spatial_scale, const int channels,
const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
rois += n * 5;
int roi_batch_ind = rois[0];
T roi_x1 = rois[1] * spatial_scale;
T roi_y1 = rois[2] * spatial_scale;
T roi_x2 = rois[3] * spatial_scale;
T roi_y2 = rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, (T)0);
T roi_height = max(roi_y2 - roi_y1, (T)0);
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T output_grad_val = grad_output[index];
const T *this_input_data =
input + (roi_batch_ind * channels + c) * height * width;
const T output_val = output[index];
T *this_rois_grad = grad_rois + n * 5;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
T sum_out = bin_size == T(0) ? T(0) : output_grad_val / bin_size;
// WARNING: to be discussed
if (sum_out == 0) return;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0;
for (int bin_y = start_y; bin_y < end_y; ++bin_y) {
grad_x1_y += PrRoIPoolingSingleCoorIntegral(
max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1,
height, width),
PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1,
height, width));
grad_x2_y += PrRoIPoolingSingleCoorIntegral(
max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2,
height, width),
PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2,
height, width));
}
for (int bin_x = start_x; bin_x < end_x; ++bin_x) {
grad_x_y1 += PrRoIPoolingSingleCoorIntegral(
max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x),
height, width),
PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1),
height, width));
grad_x_y2 += PrRoIPoolingSingleCoorIntegral(
max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x),
height, width),
PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1),
height, width));
}
T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val;
T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val;
T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val;
T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val;
partial_x1 = partial_x1 / bin_size * spatial_scale;
partial_x2 = partial_x2 / bin_size * spatial_scale;
partial_y1 = partial_y1 / bin_size * spatial_scale;
partial_y2 = partial_y2 / bin_size * spatial_scale;
// (index, x1, y1, x2, y2)
this_rois_grad[0] = 0;
atomicAdd(this_rois_grad + 1,
(partial_x1 * (1.0f - T(pw) / pooled_width) +
partial_x2 * (1.0f - T(pw + 1) / pooled_width)) *
output_grad_val);
atomicAdd(this_rois_grad + 2,
(partial_y1 * (1.0f - T(ph) / pooled_height) +
partial_y2 * (1.0f - T(ph + 1) / pooled_height)) *
output_grad_val);
atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width +
partial_x1 * T(pw) / pooled_width) *
output_grad_val);
atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height +
partial_y1 * T(ph) / pooled_height) *
output_grad_val);
}
}
#endif  // PRROI_POOL_CUDA_KERNEL_CUH
// Copyright © 2022 Apple Inc.
// This file is modified from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h
#pragma once
#include <ATen/ATen.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLDevice;
typedef void* MTLDevice_t;
#endif
using namespace std;
namespace at {
namespace mps {
//-----------------------------------------------------------------
// MPSDevice
//
// MPSDevice is a singleton class that returns the default device
//-----------------------------------------------------------------
class TORCH_API MPSDevice {
public:
/**
* MPSDevice should not be cloneable.
*/
MPSDevice(MPSDevice& other) = delete;
/**
* MPSDevice should not be assignable.
*/
void operator=(const MPSDevice&) = delete;
/**
* Gets single instance of the Device.
*/
static MPSDevice* getInstance();
/**
* Returns the single device.
*/
MTLDevice_t device() { return _mtl_device; }
~MPSDevice();
private:
static MPSDevice* _device;
MTLDevice_t _mtl_device;
MPSDevice();
};
TORCH_API bool is_available();
TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);
} // namespace mps
} // namespace at
#ifndef _MPS_LIBRARY_H_
#define _MPS_LIBRARY_H_
#include <string>
#include <unordered_map>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLComputePipelineState> MTLComputePipelineState_t;
typedef id<MTLLibrary> MTLLibrary_t;
#else
typedef void* MTLComputePipelineState;
typedef void* MTLComputePipelineState_t;
typedef void* MTLLibrary;
typedef void* MTLLibrary_t;
#endif
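// MPSLibrary wraps a compiled MTLLibrary and caches one compute pipeline
// state object per kernel function name in _pso_map.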
class MPSLibrary {
public:
// disable constructor for singleton
static MPSLibrary* createFromUrl(const std::string& library_url);
static MPSLibrary* createFromSource(const std::string& source);
~MPSLibrary();
MTLLibrary_t library() { return _library; }
MTLComputePipelineState_t getComputePipelineState(
const std::string& function_name);
private:
MTLLibrary_t _library;
std::unordered_map<std::string, MTLComputePipelineState_t> _pso_map;
};
class MPSLibraryManager {
public:
// disable constructor for singleton
MPSLibraryManager(const MPSLibraryManager&) = delete;
MPSLibraryManager& operator=(const MPSLibraryManager&) = delete;
MPSLibraryManager(MPSLibraryManager&&) = delete;
MPSLibraryManager& operator=(MPSLibraryManager&&) = delete;
static MPSLibraryManager* getInstance();
bool hasLibrary(const std::string& name);
MPSLibrary* getLibrary(const std::string& library_url);
MPSLibrary* createLibraryFromSouce(const std::string& name,
const std::string& sources);
~MPSLibraryManager();
private:
MPSLibraryManager();
std::unordered_map<std::string, std::unique_ptr<MPSLibrary>> _library_map;
};
#endif
#include "MPSLibrary.h"
#include <c10/util/CallOnce.h>
#include "MPSDevice.h"
static std::unique_ptr<MPSLibraryManager> mps_library_manager;
static c10::once_flag mpsdev_init;
MPSLibraryManager* MPSLibraryManager::getInstance() {
c10::call_once(mpsdev_init, [] {
mps_library_manager = std::unique_ptr<MPSLibraryManager>(new MPSLibraryManager());
});
return mps_library_manager.get();
}
MPSLibraryManager::~MPSLibraryManager() {}
MPSLibraryManager::MPSLibraryManager() {}
bool MPSLibraryManager::hasLibrary(const std::string& name) {
return _library_map.find(name) != _library_map.end();
}
MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) {
if (_library_map.find(library_url) != _library_map.end()) {
return _library_map[library_url].get();
}
_library_map.emplace(std::make_pair(
library_url, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromUrl(library_url))));
return _library_map[library_url].get();
}
MPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name,
const std::string& source) {
NSString* ns_name = [NSString stringWithCString:name.c_str()];
if (_library_map.find(name) != _library_map.end()) {
NSLog(@"Library %@ already exist.", ns_name);
return nullptr;
}
_library_map.emplace(
std::make_pair(name, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromSource(source))));
return _library_map[name].get();
}
MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) {
MPSLibrary* library = new MPSLibrary();
@autoreleasepool {
NSError* error = nil;
// load library and func
NSString* utl_str = [NSString stringWithCString:library_url.c_str()];
NSURL* metal_url = [NSURL fileURLWithPath:utl_str];
library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url
error:&error];
if (library->_library == nil) {
NSLog(@"Failed to find library, error %@.", error);
exit(1);
}
}
return library;
}
MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) {
MPSLibrary* library = new MPSLibrary();
@autoreleasepool {
NSError* error = nil;
// load library and func
NSString* code_str = [NSString stringWithCString:sources.c_str()];
library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str
options:nil
error:&error];
if (library->_library == nil) {
NSLog(@"Failed to find library, error %@.", error);
exit(1);
}
}
return library;
}
MPSLibrary::~MPSLibrary() {
[_library release];
_library = nil;
}
MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) {
if (_pso_map.find(function_name) != _pso_map.end()) {
return _pso_map[function_name];
}
MTLComputePipelineState_t pso;
@autoreleasepool {
NSError* error = nil;
// create function
NSString* function_name_str = [NSString stringWithCString:function_name.c_str()];
id<MTLFunction> func = [_library newFunctionWithName:function_name_str];
if (func == nil) {
NSLog(@"Failed to created pipeline state object, error %@.", error);
exit(1);
}
// create pipeline
pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func
error:&error];
_pso_map.emplace(std::make_pair(function_name, pso));
}
return _pso_map[function_name];
}
// Copyright © 2022 Apple Inc.
// This file is modified from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h
#pragma once
#include <cstdint>
#include <utility>
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/util/Exception.h>
#include "MPSDevice.h"
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
typedef id<MTLCommandQueue> MTLCommandQueue_t;
typedef id<MTLCommandBuffer> MTLCommandBuffer_t;
typedef id<MTLSharedEvent> MTLSharedEvent_t;
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLCommandQueue_t;
typedef void* MTLCommandQueue;
typedef void* MTLCommandBuffer_t;
typedef void* MTLCommandBuffer;
typedef void* MTLSharedEvent_t;
typedef void* dispatch_queue_t;
typedef void* MTLDevice_t;
#define nil NULL
#endif
namespace at {
namespace mps {
//-----------------------------------------------------------------
// MPSStream
//-----------------------------------------------------------------
class TORCH_API MPSStream {
public:
enum Unchecked { UNCHECKED };
/// Construct a MPSStream from a Stream. This construction is checked,
/// and will raise an error if the Stream is not, in fact, a MPS stream.
explicit MPSStream(Stream stream);
~MPSStream();
MTLCommandQueue_t commandQueue() const { return _commandQueue; };
dispatch_queue_t queue() const { return _serialQueue; }
MTLCommandBuffer_t commandBuffer();
void commit(bool flush);
void commitAndWait();
void synchronize();
void flush();
/// Get the MPS device index that this stream is associated with.
c10::DeviceIndex device_index() const { return _stream.device_index(); }
MTLCommandQueue_t stream() const { return _commandQueue; };
MTLDevice_t device() const { return [_commandQueue device]; }
/// Explicit conversion to Stream.
Stream unwrap() const { return _stream; }
private:
Stream _stream;
MTLCommandQueue_t _commandQueue = nil;
MTLCommandBuffer_t _commandBuffer = nil;
void _flush(bool commitAndWait) const;
dispatch_queue_t _serialQueue = nullptr;
};
/**
* Get the current MPS stream
*/
TORCH_API MPSStream* getCurrentMPSStream();
/**
* Get the default MPS stream
*/
TORCH_API MPSStream* getDefaultMPSStream();
//-----------------------------------------------------------------
// MPSStreamImpl
//-----------------------------------------------------------------
class TORCH_API MPSStreamImpl {
public:
/**
* Gets single instance of the MPSStream.
*/
static MPSStream* getInstance();
private:
static MPSStream* _stream;
MPSStreamImpl();
};
//-----------------------------------------------------------------
// MPSEvent
//-----------------------------------------------------------------
struct TORCH_API MPSEvent {
MPSEvent();
// MPSEvent(id<MTLDevice> device);
~MPSEvent();
MTLSharedEvent_t event() const { return _event; }
void recordEvent(MPSStream* stream);
void waitForEvent(MPSStream* queue); // waits on the cpu
bool queryEvent();
uint64_t getCurrentValue() { return _currentValue; }
void setCurrentValue(uint64_t currValue) { _currentValue = currValue; }
private:
bool _isRecorded = false;
uint64_t _currentValue = 0;
MTLSharedEvent_t _event;
};
typedef MPSEvent* mpsEvent_t;
} // namespace mps
} // namespace at
#ifndef _MPS_UTILS_H_
#define _MPS_UTILS_H_
#include <torch/extension.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLBuffer> MTLBuffer_t;
typedef id<MTLComputeCommandEncoder> MTLComputeCommandEncoder_t;
#else
typedef void* MTLBuffer;
typedef void* MTLBuffer_t;
typedef void* MTLComputeCommandEncoder;
typedef void* MTLComputeCommandEncoder_t;
#endif
// utils
static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) {
return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data());
}
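// setMTLArg binds one kernel argument: at::Tensor arguments are bound as the
// tensor's underlying MTLBuffer via setBuffer, anything else is copied by
// value via setBytes. The two overloads are selected with enable_if on
// whether T decays to at::Tensor.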
template <typename T,
std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t);
template <typename T,
std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
[encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index];
}
template <typename T, std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
[encoder setBytes:&t length:sizeof(t) atIndex:index];
}
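// Recursively bind a parameter pack of kernel arguments at consecutive
// indices; the zero-argument overload below terminates the recursion.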
inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {}
template <typename T, typename... Args>
void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) {
setMTLArg(encoder, index, std::forward<T>(t));
setMTLArgsImpl(encoder, index + 1, std::forward<Args>(args)...);
}
template <typename... Args>
void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... args) {
[encoder setComputePipelineState:pso];
setMTLArgsImpl(encoder, 0, std::forward<Args>(args)...);
}
#endif
@@ -1737,3 +1737,54 @@ REGISTER_DEVICE_IMPL(chamfer_distance_forward_impl, CUDA,
                     chamfer_distance_forward_cuda);
REGISTER_DEVICE_IMPL(chamfer_distance_backward_impl, CUDA,
                     chamfer_distance_backward_cuda);
void PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale);
void PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
void PrROIPoolCoorBackwardCUDAKernelLauncher(
Tensor output, Tensor grad_output, Tensor input, Tensor rois,
Tensor grad_rois, int pooled_height, int pooled_width, float spatial_scale);
void prroi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
PrROIPoolForwardCUDAKernelLauncher(input, rois, output, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_backward_cuda(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
PrROIPoolBackwardCUDAKernelLauncher(grad_output, rois, grad_input,
pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_coor_backward_cuda(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale) {
PrROIPoolCoorBackwardCUDAKernelLauncher(output, grad_output, input, rois,
grad_rois, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale);
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale);
REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_backward_impl, CUDA, prroi_pool_backward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, CUDA,
prroi_pool_coor_backward_cuda);
// Copyright (c) OpenMMLab. All rights reserved
#include "prroi_pool_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
prroi_pool_forward_cuda_kernel<float>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.data_ptr<float>(), rois.data_ptr<float>(),
output.data_ptr<float>(), pooled_height, pooled_width,
static_cast<float>(spatial_scale), channels, height, width);
AT_CUDA_CHECK(cudaGetLastError());
}
void PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width,
float spatial_scale) {
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
at::cuda::CUDAGuard device_guard(grad_output.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
prroi_pool_backward_cuda_kernel<float>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.data_ptr<float>(), rois.data_ptr<float>(),
grad_input.data_ptr<float>(), pooled_height, pooled_width,
static_cast<float>(spatial_scale), channels, height, width);
AT_CUDA_CHECK(cudaGetLastError());
}
void PrROIPoolCoorBackwardCUDAKernelLauncher(Tensor output, Tensor grad_output,
Tensor input, Tensor rois,
Tensor grad_rois,
int pooled_height,
int pooled_width,
float spatial_scale) {
int output_size = grad_output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
at::cuda::CUDAGuard device_guard(grad_output.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
prroi_pool_coor_backward_cuda_kernel<float>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, output.data_ptr<float>(), grad_output.data_ptr<float>(),
input.data_ptr<float>(), rois.data_ptr<float>(),
grad_rois.data_ptr<float>(), pooled_height, pooled_width,
static_cast<float>(spatial_scale), channels, height, width);
AT_CUDA_CHECK(cudaGetLastError());
}
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "pytorch_device_registry.hpp"
#include "MPSLibrary.h"
#include "MPSStream.h"
#include "MPSUtils.h"
using at::Tensor;
const static std::string kSourceCode = R"(
#include <metal_math>
#include <metal_stdlib>
using namespace metal;
kernel void bbox_overlap_mps_kernel(constant const float4* bboxes1,
constant const float4* bboxes2,
device float* ious,
constant int& num_bbox1,
constant int& num_bbox2,
constant int& mode,
constant bool& aligned,
constant int& offset,
uint index [[thread_position_in_grid]])
{
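    // aligned == true: IoU of matched pairs (bboxes1[i] vs bboxes2[i]);
    // aligned == false: index walks the full num_bbox1 x num_bbox2 grid.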
int base1 = index;
int base2 = index;
if(!aligned){
base1 = index / num_bbox2;
base2 = index % num_bbox2;
}
const float f_offset = float(offset);
const float4 b1 = bboxes1[base1];
const float b1_area = (b1[2]-b1[0]+f_offset)*(b1[3]-b1[1]+f_offset);
const float4 b2 = bboxes2[base2];
const float b2_area = (b2[2]-b2[0]+f_offset)*(b2[3]-b2[1]+f_offset);
const float2 left_top = fmax(b1.xy, b2.xy);
const float2 right_bottom = fmin(b1.zw, b2.zw);
const float2 wh = fmax(right_bottom - left_top + f_offset, 0.0f);
const float interS = wh.x * wh.y;
const float baseS =
fmax(mode == 0 ? b1_area + b2_area - interS : b1_area, f_offset);
ious[index] = interS / baseS;
}
)";
void BBoxOverlapsMPSKernelLauncher(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
// get stream
auto stream = at::mps::getCurrentMPSStream();
auto library_manager = MPSLibraryManager::getInstance();
MPSLibrary* library;
const static std::string kLibraryName = "bbox_overlap";
if (library_manager->hasLibrary(kLibraryName))
library = library_manager->getLibrary(kLibraryName);
else
library = library_manager->createLibraryFromSouce(kLibraryName, kSourceCode);
auto func_pso = library->getComputePipelineState("bbox_overlap_mps_kernel");
// create command buffer and encoder
MTLCommandBuffer_t command_buffer = stream->commandBuffer();
MTLComputeCommandEncoder_t compute_encoder = [command_buffer computeCommandEncoder];
// set pso and buffer
int output_size = ious.numel();
int num_bbox1 = bboxes1.size(0);
int num_bbox2 = bboxes2.size(0);
int num_elements = output_size;
setMTLArgs(compute_encoder, func_pso, bboxes1, bboxes2, ious, num_bbox1, num_bbox2, mode, aligned,
offset);
// set grid size
MTLSize grid_size = MTLSizeMake(num_elements, 1, 1);
NSUInteger thread_group_size_x = func_pso.maxTotalThreadsPerThreadgroup;
if (thread_group_size_x > num_elements) {
thread_group_size_x = num_elements;
}
MTLSize thread_group_size = MTLSizeMake(thread_group_size_x, 1, 1);
// encoding
[compute_encoder dispatchThreads:grid_size threadsPerThreadgroup:thread_group_size];
[compute_encoder endEncoding];
// commit, not sure if flush is required
stream->commit(false);
}
void bbox_overlaps_mps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,
const bool aligned, const int offset) {
BBoxOverlapsMPSKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,
const bool aligned, const int offset);
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MPS, bbox_overlaps_mps);
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_forward_impl, input, rois, output,
pooled_height, pooled_width, spatial_scale);
}
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_backward_impl, grad_output, rois, grad_input,
pooled_height, pooled_width, spatial_scale);
}
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_coor_backward_impl, output, grad_output,
input, rois, grad_rois, pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_forward(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
prroi_pool_forward_impl(input, rois, output, pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale) {
prroi_pool_backward_impl(grad_output, rois, grad_input, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,
Tensor rois, Tensor grad_rois, int pooled_height,
int pooled_width, float spatial_scale) {
prroi_pool_coor_backward_impl(output, grad_output, input, rois, grad_rois,
pooled_height, pooled_width, spatial_scale);
}
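// A minimal sketch, assuming the usual mmcv binding style (the actual
// bindings live in mmcv/ops/csrc/pytorch/pybind.cpp, which this commit does
// not show): the entry points above are exposed to Python with pybind11
// roughly like this.
//
//   m.def("prroi_pool_forward", &prroi_pool_forward, "prroi_pool_forward",
//         py::arg("input"), py::arg("rois"), py::arg("output"),
//         py::arg("pooled_height"), py::arg("pooled_width"),
//         py::arg("spatial_scale"));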