Commit 91da9643 authored by limm

support v2.1.0

parent 6f674c7e
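The filtered_lrelu hunks below repeatedly choose between CUDA's masked warp shuffles (__shfl_xor_sync) and HIP's unmasked __shfl_xor, with the MMCV_WITH_HIP branch listed first. A minimal sketch of the recurring pattern, using a hypothetical combine4 helper in place of the inlined code (not part of this commit):

// Hypothetical helper: OR-combine a per-thread value across a 4-lane group.
// On ROCm (MMCV_WITH_HIP) the legacy unmasked shuffle is used; on CUDA the
// masked *_sync variant takes the group's active-lane mask.
__device__ uint32_t combine4(uint32_t s, uint32_t groupMask) {
#ifdef MMCV_WITH_HIP
  s |= __shfl_xor(s, 1);
  s |= __shfl_xor(s, 2);
#else
  s |= __shfl_xor_sync(groupMask, s, 1);
  s |= __shfl_xor_sync(groupMask, s, 2);
#endif
  return s;
}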
......@@ -672,12 +672,12 @@ static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p) {
// Combine signs.
uint32_t s = sx + sy + sw + sz;
s <<= (signX & 3) << 1;
#ifndef MMCV_WITH_HIP
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
#else
#ifdef MMCV_WITH_HIP
s |= __shfl_xor(s, 1);
s |= __shfl_xor(s, 2);
#else
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
#endif
// Write signs.
......@@ -725,13 +725,14 @@ static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p) {
// Combine signs.
uint32_t s = sx + sy + sw + sz;
s <<= (signX & 3) << 1;
#ifndef MMCV_WITH_HIP
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
#else
#ifdef MMCV_WITH_HIP
s |= __shfl_xor(s, 1);
s |= __shfl_xor(s, 2);
#else
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
#endif
// Write signs.
if ((uint32_t)(signY + 0) < sShapeMaxY) {
p.s[si0] = (unsigned char)(s >> 0);
......@@ -861,13 +862,14 @@ static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p) {
// Combine signs.
int s = sx + sy;
s <<= signXo;
#ifndef MMCV_WITH_HIP
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
#else
#ifdef MMCV_WITH_HIP
s |= __shfl_xor(s, 1);
s |= __shfl_xor(s, 2);
#else
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
#endif
// Write signs.
if ((uint32_t)(signY + 0) < sShapeMaxY) {
p.s[si0] = (unsigned char)(s >> 0);
......@@ -895,13 +897,14 @@ static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p) {
// Combine signs.
int s = sx + sy;
s <<= signXo;
#ifndef MMCV_WITH_HIP
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
#else
#ifdef MMCV_WITH_HIP
s |= __shfl_xor(s, 1);
s |= __shfl_xor(s, 2);
#else
s |= __shfl_xor_sync(groupMask, s, 1);
s |= __shfl_xor_sync(groupMask, s, 2);
#endif
// Write signs.
if ((uint32_t)(signY + 0) < sShapeMaxY) {
p.s[si0] = (unsigned char)(s >> 0);
......@@ -1188,14 +1191,14 @@ static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p) {
}
if ((uint32_t)signXb < p.swLimit &&
(uint32_t)signY < p.sShape.y && signY >= minY) {
#ifndef MMCV_WITH_HIP
s += __shfl_xor_sync(groupMask, s, 1); // Coalesce.
s += __shfl_xor_sync(groupMask, s, 2); // Coalesce.
#else
#ifdef MMCV_WITH_HIP
s += __shfl_xor(s, 1); // Coalesce.
s += __shfl_xor(s, 2); // Coalesce.
#else
s += __shfl_xor_sync(groupMask, s, 1); // Coalesce.
s += __shfl_xor_sync(groupMask, s, 2); // Coalesce.
#endif
p.s[si] = s; // Write.
p.s[si] = s; // Write.
}
} else {
// Determine and write sign.
......@@ -1211,14 +1214,14 @@ static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p) {
s = signXbit * 2;
v = InternalType<T>::clamp(v, p.clamp);
}
#ifndef MMCV_WITH_HIP
s += __shfl_xor_sync(groupMask, s, 1); // Coalesce.
s += __shfl_xor_sync(groupMask, s, 2); // Coalesce.
#else
#ifdef MMCV_WITH_HIP
s += __shfl_xor(s, 1); // Coalesce.
s += __shfl_xor(s, 2); // Coalesce.
#else
s += __shfl_xor_sync(groupMask, s, 1); // Coalesce.
s += __shfl_xor_sync(groupMask, s, 2); // Coalesce.
#endif
p.s[si] = s; // Write.
p.s[si] = s; // Write.
} else {
// Just compute the value.
if (v < 0.f) v *= p.slope;
......@@ -1438,17 +1441,18 @@ static __global__ void filtered_lrelu_act_kernel(
// Coalesce into threads 0 and 16 of warp.
uint32_t m = (threadIdx.x & 16) ? 0xffff0000u : 0x0000ffffu;
s <<= ((threadIdx.x & 15) << 1); // Shift into place.
#ifndef MMCV_WITH_HIP
s |= __shfl_xor_sync(m, s, 1); // Distribute.
s |= __shfl_xor_sync(m, s, 2);
s |= __shfl_xor_sync(m, s, 4);
s |= __shfl_xor_sync(m, s, 8);
#else
s |= __shfl_xor(s, 1); // Distribute.
#ifdef MMCV_WITH_HIP
s |= __shfl_xor(s, 1); // Distribute.
s |= __shfl_xor(s, 2);
s |= __shfl_xor(s, 4);
s |= __shfl_xor(s, 8);
#else
s |= __shfl_xor_sync(m, s, 1); // Distribute.
s |= __shfl_xor_sync(m, s, 2);
s |= __shfl_xor_sync(m, s, 4);
s |= __shfl_xor_sync(m, s, 8);
#endif
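// After the xor-shuffles with offsets 1, 2, 4 and 8, every lane of its 16-lane
// half (selected by the mask m) holds the OR of all 16 shifted 2-bit sign
// fields, so only the half-warp leader needs to issue the write below.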
// Write signs if leader and in p.s.
if (!(threadIdx.x & 15) && x < p.sShape.x) // y is always in.
{
......@@ -1627,7 +1631,6 @@ filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(
#endif
#endif
#if CUDA_VERSION < 10020
#undef BUILD_FILTERED_LRELU_OP
#define BUILD_FILTERED_LRELU_OP 0
......@@ -1673,11 +1676,15 @@ std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op(
// Figure out how much shared memory is available on the device.
int maxSharedBytes = 0;
int result=cudaDeviceGetAttribute(&maxSharedBytes,
// cudaDevAttrMaxSharedMemoryPerBlockOptin,
// hipDeviceAttributeSharedMemPerBlockOptin,
hipDeviceAttributeMaxSharedMemoryPerBlock,
x.device().index());
#ifdef MMCV_WITH_HIP
cudaDeviceGetAttribute(&maxSharedBytes,
hipDeviceAttributeMaxSharedMemoryPerBlock,
x.device().index());
#else
AT_CUDA_CHECK(cudaDeviceGetAttribute(&maxSharedBytes,
cudaDevAttrMaxSharedMemoryPerBlockOptin,
x.device().index()));
#endif
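// Note: the CUDA branch queries cudaDevAttrMaxSharedMemoryPerBlockOptin, the
// largest dynamic shared memory a kernel may opt into via
// cudaFuncAttributeMaxDynamicSharedMemorySize, while the HIP branch reads the
// plain per-block maximum and is not wrapped in AT_CUDA_CHECK.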
int sharedKB = maxSharedBytes >> 10;
// Populate enough launch parameters to check if a CUDA kernel exists.
......@@ -1875,15 +1882,15 @@ std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op(
p.tilesXrep = 0;
p.tilesXdim = 0;
}
#ifdef MMCV_WITH_HIP
AT_CUDA_CHECK(hipLaunchKernel(spec.setup, 1, 1024, args, 0,
at::cuda::getCurrentCUDAStream()));
#else
// Launch filter setup kernel.
#ifndef MMCV_WITH_HIP
AT_CUDA_CHECK(cudaLaunchKernel(spec.setup, 1, 1024, args, 0,
at::cuda::getCurrentCUDAStream()));
#else
AT_CUDA_CHECK(hipLaunchKernel(spec.setup, 1, 1024, args, 0,
at::cuda::getCurrentCUDAStream()));
#endif
// Copy kernels to constant memory.
if (writeSigns && !readSigns)
AT_CUDA_CHECK((copy_filters(at::cuda::getCurrentCUDAStream())));
......@@ -1895,11 +1902,15 @@ std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op(
// Set cache and shared memory configurations for main kernel.
AT_CUDA_CHECK(cudaFuncSetCacheConfig(spec.exec, cudaFuncCachePreferShared));
if (spec.dynamicSharedKB) // Need dynamically allocated shared memory?
// AT_CUDA_CHECK(cudaFuncSetAttribute(
#ifdef MMCV_WITH_HIP
AT_CUDA_CHECK(hipFuncSetAttribute(
// spec.exec, cudaFuncAttributeMaxDynamicSharedMemorySize,
spec.exec, hipFuncAttributeMaxDynamicSharedMemorySize,
spec.dynamicSharedKB << 10));
#else
AT_CUDA_CHECK(cudaFuncSetAttribute(
spec.exec, cudaFuncAttributeMaxDynamicSharedMemorySize,
spec.dynamicSharedKB << 10));
#endif
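// A kernel that wants more dynamic shared memory than the default per-block
// limit must raise (cuda/hip)FuncAttributeMaxDynamicSharedMemorySize before
// launch; the block above does exactly that when spec.dynamicSharedKB is set.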
AT_CUDA_CHECK(
cudaFuncSetSharedMemConfig(spec.exec, cudaSharedMemBankSizeFourByte));
......@@ -1910,12 +1921,12 @@ std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op(
{
p.blockZofs = zofs;
int subGz = std::min(maxSubGz, gz - zofs);
#ifndef MMCV_WITH_HIP
AT_CUDA_CHECK(cudaLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args,
spec.dynamicSharedKB << 10,
at::cuda::getCurrentCUDAStream()));
#else
#ifdef MMCV_WITH_HIP
AT_CUDA_CHECK(hipLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args,
spec.dynamicSharedKB << 10,
at::cuda::getCurrentCUDAStream()));
#else
AT_CUDA_CHECK(cudaLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args,
spec.dynamicSharedKB << 10,
at::cuda::getCurrentCUDAStream()));
#endif
......@@ -2033,12 +2044,13 @@ torch::Tensor filtered_lrelu_act_op(torch::Tensor x, torch::Tensor si, int sx,
gz = std::min(gz, gmax);
// Launch.
#ifndef MMCV_WITH_HIP
AT_CUDA_CHECK(cudaLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0,
at::cuda::getCurrentCUDAStream()));
#else
#ifdef MMCV_WITH_HIP
AT_CUDA_CHECK(hipLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0,
at::cuda::getCurrentCUDAStream()));
#else
AT_CUDA_CHECK(cudaLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0,
at::cuda::getCurrentCUDAStream()));
#endif
return so;
}
......@@ -734,12 +734,13 @@ torch::Tensor upfirdn2d_op(torch::Tensor x, torch::Tensor f, int upx, int upy,
// Launch CUDA kernel.
void *args[] = {&p};
#ifndef MMCV_WITH_HIP
AT_CUDA_CHECK(cudaLaunchKernel(spec.kernel, gridSize, blockSize, args, 0,
at::cuda::getCurrentCUDAStream()));
#else
#ifdef MMCV_WITH_HIP
AT_CUDA_CHECK(hipLaunchKernel(spec.kernel, gridSize, blockSize, args, 0,
at::cuda::getCurrentCUDAStream()));
#else
AT_CUDA_CHECK(cudaLaunchKernel(spec.kernel, gridSize, blockSize, args, 0,
at::cuda::getCurrentCUDAStream()));
#endif
return y;
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_DIOPI
#include <diopi/diopirt.h>
#include <diopi/functions.h>
#include <diopi/functions_mmcv.h>
#include "csrc_dipu/diopirt/diopirt_impl.h"
using dipu::diopi_helper::toDiopiScalar;
using dipu::diopi_helper::toDiopiTensorHandle;
#endif
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
......@@ -29,15 +39,92 @@ void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
buff, grad_input, gamma, alpha);
}
#ifdef MMCV_WITH_DIOPI
void sigmoid_focal_loss_forward_diopi(Tensor input, Tensor target,
Tensor weight, Tensor output, float gamma,
float alpha) {
auto input_p = toDiopiTensorHandle(input);
diopiDevice_t device;
diopiGetTensorDevice(input_p, &device);
if (device == diopi_host) {
sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma,
alpha);
return;
}
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
auto target_p = toDiopiTensorHandle(target);
auto weight_p = toDiopiTensorHandle(weight);
auto output_p = toDiopiTensorHandle(output);
if (reinterpret_cast<void *>(diopiSigmoidFocalLossMmcv) != nullptr) {
auto ret = diopiSigmoidFocalLossMmcv(ch, output_p, input_p, target_p,
weight_p, gamma, alpha);
if (ret == diopiSuccess) return;
}
LOG(WARNING)
<< "Fallback to cpu: mmcv ext op sigmoid_focal_loss_forward_impl";
auto input_cpu = input.cpu();
auto target_cpu = target.cpu();
auto weight_cpu = weight.cpu();
auto output_cpu = output.cpu();
sigmoid_focal_loss_forward_impl(input_cpu, target_cpu, weight_cpu, output_cpu,
gamma, alpha);
output.copy_(output_cpu);
return;
}
void sigmoid_focal_loss_backward_diopi(Tensor input, Tensor target,
Tensor weight, Tensor grad_input,
float gamma, float alpha) {
auto input_p = toDiopiTensorHandle(input);
diopiDevice_t device;
diopiGetTensorDevice(input_p, &device);
if (device == diopi_host) {
sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
alpha);
return;
}
diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
diopiContextHandle_t ch = &ctx;
auto target_p = toDiopiTensorHandle(target);
auto weight_p = toDiopiTensorHandle(weight);
auto grad_input_p = toDiopiTensorHandle(grad_input);
if (reinterpret_cast<void *>(diopiSigmoidFocalLossBackwardMmcv) != nullptr) {
auto ret = diopiSigmoidFocalLossBackwardMmcv(
ch, grad_input_p, input_p, target_p, weight_p, gamma, alpha);
if (ret == diopiSuccess) return;
}
LOG(WARNING)
<< "Fallback to cpu: mmcv ext op sigmoid_focal_loss_forward_impl";
auto input_cpu = input.cpu();
auto target_cpu = target.cpu();
auto weight_cpu = weight.cpu();
auto grad_input_cpu = grad_input.cpu();
sigmoid_focal_loss_backward_impl(input_cpu, target_cpu, weight_cpu,
grad_input_cpu, gamma, alpha);
grad_input.copy_(grad_input_cpu);
return;
}
#endif
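// The *_diopi wrappers above try three paths in order: host tensors go
// straight to the existing *_impl functions, device tensors are handed to the
// DIOPI kernels (diopiSigmoidFocalLossMmcv / diopiSigmoidFocalLossBackwardMmcv),
// and if the DIOPI kernel is missing or does not return diopiSuccess the data
// is copied to the CPU, computed with *_impl, and copied back. The dispatchers
// below select these wrappers whenever MMCV_WITH_DIOPI is defined.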
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
#ifdef MMCV_WITH_DIOPI
sigmoid_focal_loss_forward_diopi(input, target, weight, output, gamma, alpha);
#else
sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
#endif
}
void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma, float alpha) {
#ifdef MMCV_WITH_DIOPI
sigmoid_focal_loss_backward_diopi(input, target, weight, grad_input, gamma,
alpha);
#else
sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
alpha);
#endif
}
void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
......
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "mlu_common_helper.h"
void ball_query_forward_mlu(int b, int n, int m, float min_radius,
float max_radius, int nsample, const Tensor new_xyz,
const Tensor xyz, Tensor idx) {
auto new_xyz_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
new_xyz, new_xyz.suggest_memory_format());
auto xyz_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
xyz, new_xyz.suggest_memory_format());
auto idx_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
idx, new_xyz.suggest_memory_format());
MluOpTensorDescriptor new_xyz_desc, xyz_desc, idx_desc;
new_xyz_desc.set(new_xyz_contiguous);
xyz_desc.set(xyz_contiguous);
idx_desc.set(idx_contiguous);
auto new_xyz_impl = torch_mlu::getMluTensorImpl(new_xyz_contiguous);
auto xyz_impl = torch_mlu::getMluTensorImpl(xyz_contiguous);
auto idx_impl = torch_mlu::getMluTensorImpl(idx_contiguous);
auto new_xyz_ptr = new_xyz_impl->cnnlMalloc();
auto xyz_ptr = xyz_impl->cnnlMalloc();
auto idx_ptr = idx_impl->cnnlMalloc();
auto handle = mluOpGetCurrentHandle();
TORCH_MLUOP_CHECK(mluOpBallQuery(
handle, new_xyz_desc.desc(), new_xyz_ptr, xyz_desc.desc(), xyz_ptr,
min_radius, max_radius, nsample, idx_desc.desc(), idx_ptr));
}
void ball_query_forward_impl(int b, int n, int m, float min_radius,
float max_radius, int nsample,
const Tensor new_xyz, const Tensor xyz,
Tensor idx);
REGISTER_DEVICE_IMPL(ball_query_forward_impl, MLU, ball_query_forward_mlu);
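The MLU files in this commit follow the same shape as ball_query above: make the tensors contiguous, wrap them in MluOpTensorDescriptor, pull raw device pointers through getMluTensorImpl()/cnnlMalloc(), fetch the current mluOp handle, issue the checked mluOp call, and bind the function to the dispatcher with REGISTER_DEVICE_IMPL. A minimal sketch of that skeleton for a hypothetical single-input op (my_op_* and mluOpMyOp are placeholders, not real symbols):

#include "mlu_common_helper.h"

// Placeholder op used only to illustrate the recurring structure.
void my_op_forward_mlu(Tensor input, Tensor output) {
  // 1. Contiguous copies in the tensors' suggested memory format.
  auto input_c = torch_mlu::cnnl::ops::cnnl_contiguous(
      input, input.suggest_memory_format());
  auto output_c = torch_mlu::cnnl::ops::cnnl_contiguous(
      output, output.suggest_memory_format());
  // 2. mluOp tensor descriptors.
  MluOpTensorDescriptor input_desc, output_desc;
  input_desc.set(input_c);
  output_desc.set(output_c);
  // 3. Raw device pointers.
  auto input_ptr = torch_mlu::getMluTensorImpl(input_c)->cnnlMalloc();
  auto output_ptr = torch_mlu::getMluTensorImpl(output_c)->cnnlMalloc();
  // 4. Checked call on the current handle.
  auto handle = mluOpGetCurrentHandle();
  TORCH_MLUOP_CHECK(mluOpMyOp(handle, input_desc.desc(), input_ptr,
                              output_desc.desc(), output_ptr));
  // 5. Copy the result back into the caller's tensor.
  output.copy_(output_c);
}

void my_op_forward_impl(Tensor input, Tensor output);
REGISTER_DEVICE_IMPL(my_op_forward_impl, MLU, my_op_forward_mlu);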
......@@ -10,36 +10,11 @@
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
#include "mlu_common_helper.h"
void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *bbox1, const void *bbox2, void *ious,
const int32_t num_bbox1, const int32_t num_bbox2,
const int32_t mode, const bool aligned,
const int32_t offset);
static void policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
const int32_t batch_num_all) {
auto union_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto core_num = union_num * core_dim;
// Union1 policyFunc
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = core_dim;
auto need_core_num = PAD_UP(batch_num_all, core_dim);
k_dim->y =
(need_core_num < core_num) ? (need_core_num / core_dim) : union_num;
k_dim->z = 1;
return;
}
void BBoxOverlapsMLUKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int32_t mode,
const bool aligned, const int32_t offset) {
void bbox_overlaps_mlu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int32_t mode, const bool aligned,
const int32_t offset) {
// check dtype
TORCH_CHECK(
bboxes1.scalar_type() == at::kFloat || bboxes1.scalar_type() == at::kHalf,
......@@ -63,38 +38,19 @@ void BBoxOverlapsMLUKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
return;
}
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(&k_dim, &k_type, batch_num_all);
INITIAL_MLU_PARAM_WITH_TENSOR(bboxes1);
INITIAL_MLU_PARAM_WITH_TENSOR(bboxes2);
INITIAL_MLU_PARAM_WITH_TENSOR(ious);
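// INITIAL_MLU_PARAM_WITH_TENSOR (presumably defined in mlu_common_helper.h)
// evidently expands to the contiguous-copy / descriptor / device-pointer
// boilerplate, since the <name>_desc and <name>_ptr it introduces are what the
// mluOpBboxOverlaps call below consumes.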
// get compute queue
cnrtQueue_t queue = torch_mlu::getCurQueue();
// get compute handle
auto handle = mluOpGetCurrentHandle();
// get dtype of input
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(bboxes1.dtype());
// get ptr of tensors
auto bboxes1_impl = torch_mlu::getMluTensorImpl(bboxes1);
auto bboxes1_ptr = bboxes1_impl->cnnlMalloc();
auto bboxes2_impl = torch_mlu::getMluTensorImpl(bboxes2);
auto bboxes2_ptr = bboxes2_impl->cnnlMalloc();
auto ious_impl = torch_mlu::getMluTensorImpl(ious);
auto ious_ptr = ious_impl->cnnlMalloc();
// launch kernel
CNLOG(INFO) << "Launch Kernel MLUUnion1BboxOverlapsKernel";
CNLOG(INFO) << "kDim :[ " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z
<< " ]";
KernelBBoxOverlaps(k_dim, k_type, queue, d_type, bboxes1_ptr, bboxes2_ptr,
ious_ptr, rows, cols, mode, aligned, offset);
}
void bbox_overlaps_mlu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
BBoxOverlapsMLUKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
TORCH_MLUOP_CHECK(mluOpBboxOverlaps(
handle, mode, aligned, offset, bboxes1_desc.desc(), bboxes1_ptr,
bboxes2_desc.desc(), bboxes2_ptr, ious_desc.desc(), ious_ptr));
}
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset);
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MLU, bbox_overlaps_mlu);
/*************************************************************************
* Copyright (C) 2022 by Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "mlu_common_helper.h"
void BoxIouRotatedMLUKernelLauncher(const Tensor boxes1, const Tensor boxes2,
Tensor ious, const int mode_flag,
const bool aligned) {
// get compute handle
auto handle = mluOpGetCurrentHandle();
auto boxes1_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
boxes1, boxes1.suggest_memory_format());
auto boxes2_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
boxes2, boxes2.suggest_memory_format());
auto ious_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(ious, ious.suggest_memory_format());
MluOpTensorDescriptor boxes1_desc, boxes2_desc, ious_desc;
boxes1_desc.set(boxes1_contiguous);
boxes2_desc.set(boxes2_contiguous);
ious_desc.set(ious_contiguous);
auto boxes1_impl = torch_mlu::getMluTensorImpl(boxes1_contiguous);
auto boxes2_impl = torch_mlu::getMluTensorImpl(boxes2_contiguous);
auto ious_impl = torch_mlu::getMluTensorImpl(ious_contiguous);
auto boxes1_ptr = boxes1_impl->cnnlMalloc();
auto boxes2_ptr = boxes2_impl->cnnlMalloc();
auto ious_ptr = ious_impl->cnnlMalloc();
CNLOG(INFO) << "Call mluOpBoxIouRotated().";
TORCH_MLUOP_CHECK(mluOpBoxIouRotated(
handle, mode_flag, aligned, boxes1_desc.desc(), boxes1_ptr,
boxes2_desc.desc(), boxes2_ptr, ious_desc.desc(), ious_ptr));
}
void box_iou_rotated_mlu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
BoxIouRotatedMLUKernelLauncher(boxes1, boxes2, ious, mode_flag, aligned);
}
void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
REGISTER_DEVICE_IMPL(box_iou_rotated_impl, MLU, box_iou_rotated_mlu);
......@@ -9,200 +9,13 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "carafe_utils.hpp"
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelCarafeForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *input, const void *mask,
const CarafeForwardParam &param,
const CarafeForwardBlockDim &block_dim,
const CarafeForwardGridDim &grid_dim, void *output);
void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t dtype,
const void *input, const void *mask,
const void *grad_output, void *grad_input,
void *grad_mask, const int n, const int hi,
const int wi, const int c, const int k_up,
const int group, const int scale);
// Get total NRAM usage and set strides of NRAM arrays.
static void getNramUsage(CarafeForwardParam *param,
CarafeForwardBlockDim *block_dim, int *nram_usage) {
// input_nram[blkDim_(Hi+Kh)-1, blkDim_(Wi+Kw)-1, blkDim_G, blkDim_Cg]
block_dim->Hi = CEIL_DIV(block_dim->Ho, param->scale_factor) + 1;
block_dim->Wi = CEIL_DIV(block_dim->Wo, param->scale_factor) + 1;
param->input_nram_stride_g = PAD_UP(block_dim->Cg, param->align_size_NRAM);
param->input_nram_stride_w = param->input_nram_stride_g * block_dim->G;
param->input_nram_stride_h =
(block_dim->Wi + block_dim->Kw - 1) * param->input_nram_stride_w;
param->input_nram_size =
(block_dim->Hi + block_dim->Kh - 1) * param->input_nram_stride_h;
// mask_nram[blkDim_Ho, blkDim_Wo, blkDim_G, blkDim_Kh, blkDim_Kw]
param->mask_nram_stride_kh = block_dim->Kw;
param->mask_nram_stride_g = block_dim->Kh * param->mask_nram_stride_kh;
param->mask_nram_stride_w = block_dim->G * param->mask_nram_stride_g;
param->mask_nram_stride_h = block_dim->Wo * param->mask_nram_stride_w;
param->mask_nram_size =
PAD_UP(block_dim->Ho * param->mask_nram_stride_h, param->align_size_NRAM);
// output_nram[blkDim_Ho, blkDim_Wo, blkDim_(G*Cg)]
param->output_nram_stride_g = param->input_nram_stride_g;
param->output_nram_stride_w =
PAD_UP(param->input_nram_stride_w, param->align_size_NFU);
param->output_nram_stride_h = block_dim->Wo * param->output_nram_stride_w;
param->output_nram_size = block_dim->Ho * param->output_nram_stride_h;
// sum_array[blkDim_(G*Cg)]
// ensure the last mul_const on Cg does not exceed memory boundary
int sum_array_size_bang_mul_const =
(block_dim->G - 1) * param->input_nram_stride_g +
PAD_UP(param->input_nram_stride_g, param->align_size_NFU);
int sum_array_size =
std::max(param->output_nram_stride_w, sum_array_size_bang_mul_const);
*nram_usage = param->input_nram_size + param->mask_nram_size +
param->output_nram_size + sum_array_size;
}
// Policy Function for Forward
static void genPolicyForward(CarafeForwardParam *param,
CarafeForwardBlockDim *block_dim,
CarafeForwardGridDim *grid_dim, cnrtDim3_t *k_dim,
cnrtFunctionType_t *k_type) {
// device info
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
auto core_num = core_dim * cluster_num;
// maximum NRAM size as the number of <dtype>
auto max_nram_size =
torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore) / param->dtype_size;
// determine grid and block dimensions
// set initial values for block_dim and grid_dim
block_dim->Ho = param->Ho;
block_dim->Wo = param->Wo;
block_dim->Kh = param->kernel_size;
block_dim->Kw = param->kernel_size;
block_dim->G = param->group_size;
block_dim->Cg = param->Cg;
grid_dim->Ho = 1;
grid_dim->Wo = 1;
grid_dim->Kh = 1;
grid_dim->Kw = 1;
grid_dim->G = 1;
grid_dim->Cg = 1;
// decrease the block size to fit in the NRAM.
int nram_usage = 0;
while (true) {
getNramUsage(param, block_dim, &nram_usage);
if (nram_usage > max_nram_size) {
// decrease Ho
// decrease block_Ho and block_Wo evenly
// so that the block is close to a square.
if (block_dim->Ho > 1 && block_dim->Ho >= block_dim->Wo) {
grid_dim->Ho += 1;
block_dim->Ho = CEIL_DIV(param->Ho, grid_dim->Ho);
} else if (block_dim->Wo > 1 && block_dim->Wo > block_dim->Ho) {
// decrease Wo
grid_dim->Wo += 1;
block_dim->Wo = CEIL_DIV(param->Wo, grid_dim->Wo);
} else if (block_dim->Kh > 1) {
// decrease Kh
grid_dim->Kh += 1;
block_dim->Kh = CEIL_DIV(param->kernel_size, grid_dim->Kh);
// reset Hi, Wi to maximize NRAM usage
grid_dim->Ho = 1;
block_dim->Ho = param->Ho;
grid_dim->Wo = 1;
block_dim->Wo = param->Wo;
} else if (block_dim->Kw > 1) {
// decrease Kw
grid_dim->Kw += 1;
block_dim->Kw = CEIL_DIV(param->kernel_size, grid_dim->Kw);
// reset Kh
grid_dim->Kh = 1;
block_dim->Kh = param->kernel_size;
} else if (block_dim->G > 1) {
// decrease G
grid_dim->G += 1;
block_dim->G = CEIL_DIV(param->group_size, grid_dim->G);
// reset Kw
grid_dim->Kw = 1;
block_dim->Kw = param->kernel_size;
} else if (block_dim->Cg > 1) {
// decrease block_Cg
// This is done in the last since c is the continuous dim
// (input layout is NHWC) and large c can improve
// IO & compute efficiency.
grid_dim->Cg += 1;
block_dim->Cg = CEIL_DIV(param->Cg, grid_dim->Cg);
// reset G
grid_dim->G = 1;
block_dim->G = param->group_size;
} else {
// the block volume is one now, cannot decrease the block size anymore!
// this situation should not occur.
break;
}
} else {
break;
}
}
// define parameters depending on block_dim, grid_dim
param->block_Cg_NFU = PAD_UP(block_dim->Cg, param->align_size_NFU);
// define host arrays' strides
// input[N,H,W,G,Cg]
param->input_stride_g = param->Cg;
param->input_stride_w = param->Ci;
param->input_stride_h = param->Wi * param->input_stride_w;
param->input_stride_n = param->Hi * param->input_stride_h;
// mask[N,Ho,Wo,G,Kh,Kw]
param->mask_stride_kh = param->kernel_size;
param->mask_stride_g = param->kernel_size * param->mask_stride_kh;
param->mask_stride_w = param->group_size * param->mask_stride_g;
param->mask_stride_h = param->Wo * param->mask_stride_w;
param->mask_stride_n = param->Ho * param->mask_stride_h;
// output[N,Ho,Wo,G,Cg]
param->output_stride_g = param->Cg;
param->output_stride_w = param->Ci;
param->output_stride_h = param->Wo * param->output_stride_w;
param->output_stride_n = param->Ho * param->output_stride_h;
param->job_num =
param->N * grid_dim->Ho * grid_dim->Wo * grid_dim->G * grid_dim->Cg;
// determine task type and dims
*k_type = CNRT_FUNC_TYPE_BLOCK;
k_dim->x = std::min(param->job_num, static_cast<int>(core_num));
k_dim->y = 1;
k_dim->z = 1;
}
#include "mlu_common_helper.h"
void CARAFEForwardMLUKernelLauncher(const Tensor input, const Tensor mask,
Tensor rinput, Tensor routput, Tensor rmask,
Tensor output, const int kernel_size,
const int group_size,
const int scale_factor) {
const int batch_size = output.size(0);
const int channels = output.size(1);
const int ho = output.size(2);
const int wo = output.size(3);
// check tensor data type
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
......@@ -221,37 +34,10 @@ void CARAFEForwardMLUKernelLauncher(const Tensor input, const Tensor mask,
// return fast on zero-element tensor
if (output.numel() == 0) {
output = at::zeros({batch_size, channels, ho, wo}, output.options());
output = at::zeros(output.sizes().vec(), output.options());
return;
}
// set param
CarafeForwardParam param;
param.N = input.size(0);
param.Ci = input.size(1);
param.Hi = input.size(2);
param.Wi = input.size(3);
param.kernel_size = kernel_size;
param.group_size = group_size;
param.scale_factor = scale_factor;
param.Cg = param.Ci / group_size;
param.dtype_size = input.itemsize();
param.align_size_NRAM = NRAM_ALIGN_SIZE / param.dtype_size;
param.align_size_NFU = NFU_ALIGN_SIZE / param.dtype_size;
param.kernel_size_sq = param.kernel_size * param.kernel_size;
param.kernel_size_half = (param.kernel_size - 1) / 2;
param.Ho = param.Hi * param.scale_factor;
param.Wo = param.Wi * param.scale_factor;
// generate policy
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
CarafeForwardBlockDim block_dim;
CarafeForwardGridDim grid_dim;
genPolicyForward(&param, &block_dim, &grid_dim, &k_dim, &k_type);
// convert NCHW to NHWC
auto memory_format_input_nhwc =
torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
......@@ -268,6 +54,12 @@ void CARAFEForwardMLUKernelLauncher(const Tensor input, const Tensor mask,
auto routput_ =
torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format_output_nhwc);
// set tensor descriptor
MluOpTensorDescriptor input_desc, mask_desc, output_desc;
input_desc.set_with_layout(rinput_, MLUOP_LAYOUT_NHWC);
mask_desc.set_with_layout(rmask_, MLUOP_LAYOUT_NHWC);
output_desc.set_with_layout(routput_, MLUOP_LAYOUT_NHWC);
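// set_with_layout tags the descriptors as NHWC to match the channels-last
// contiguous copies created above; mluOpCarafeForward works on that layout and
// the result is copied back into the NCHW output tensor at the end.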
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(rinput_);
auto input_ptr = input_impl->cnnlMalloc();
......@@ -276,45 +68,29 @@ void CARAFEForwardMLUKernelLauncher(const Tensor input, const Tensor mask,
auto output_impl = torch_mlu::getMluTensorImpl(routput_);
auto output_ptr = output_impl->cnnlMalloc();
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get dtype of input
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());
// set op descriptor
auto handle = mluOpGetCurrentHandle();
mluOpCarafeDescriptor_t carafe_desc;
TORCH_MLUOP_CHECK(mluOpCreateCarafeDescriptor(&carafe_desc));
TORCH_MLUOP_CHECK(mluOpSetCarafeDescriptor(
carafe_desc, input.dim(), kernel_size, group_size, scale_factor));
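// The carafe descriptor carries the op hyper-parameters (tensor rank,
// kernel_size, group_size, scale_factor); it is created here, passed to
// mluOpCarafeForward below, and destroyed once the call returns.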
// launch kernel
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
CNLOG(INFO) << "Launch Kernel KernelCarafeForward<<<Union"
<< k_type / core_dim << ", " << k_dim.x << ", " << k_dim.y << ", "
<< k_dim.z << ">>>";
KernelCarafeForward(k_dim, k_type, queue, d_type, input_ptr, mask_ptr, param,
block_dim, grid_dim, output_ptr);
TORCH_MLUOP_CHECK(mluOpCarafeForward(handle, carafe_desc, input_desc.desc(),
input_ptr, mask_desc.desc(), mask_ptr,
output_desc.desc(), output_ptr));
// destroy op descriptor
TORCH_MLUOP_CHECK(mluOpDestroyCarafeDescriptor(carafe_desc));
// copy output from NHWC back into NCHW
rinput.copy_(rinput_);
output.copy_(routput_);
}
// Policy Function for Backward
static void policyFuncBackward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
// set Union1 Job
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
k_dim->y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
k_dim->z = 1;
}
void CARAFEBackwardMLUKernelLauncher(
const Tensor grad_output, const Tensor rinput, const Tensor mask,
Tensor rgrad_output, Tensor rgrad_input_hs, Tensor rgrad_input,
Tensor rgrad_mask, Tensor grad_input, Tensor grad_mask,
const int kernel_size, const int group_size, const int scale_factor) {
const int batch_size = rinput.size(0);
const int channels = rinput.size(1);
const int hi = rinput.size(2);
const int wi = rinput.size(3);
// data type check
TORCH_CHECK(grad_output.scalar_type() == at::kFloat ||
grad_output.scalar_type() == at::kHalf,
......@@ -331,11 +107,6 @@ void CARAFEBackwardMLUKernelLauncher(
TORCH_CHECK(kernel_size < 137, "kernel_size should be less than 137, got ",
kernel_size);
// set task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFuncBackward(&k_dim, &k_type);
// convert NCHW to NHWC
auto memory_format_input_nhwc =
torch_mlu::cnnl::ops::get_channels_last_memory_format(rinput.dim());
......@@ -363,8 +134,15 @@ void CARAFEBackwardMLUKernelLauncher(
auto rgrad_mask_ = torch_mlu::cnnl::ops::cnnl_contiguous(
grad_mask, memory_format_grad_mask_nhwc);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// set tensor descriptor
MluOpTensorDescriptor input_desc, mask_desc;
input_desc.set_with_layout(rinput_, MLUOP_LAYOUT_NHWC);
mask_desc.set_with_layout(rmask_, MLUOP_LAYOUT_NHWC);
MluOpTensorDescriptor grad_output_desc, grad_input_desc, grad_mask_desc;
grad_output_desc.set_with_layout(rgrad_output_, MLUOP_LAYOUT_NHWC);
grad_input_desc.set_with_layout(rgrad_input_, MLUOP_LAYOUT_NHWC);
grad_mask_desc.set_with_layout(rgrad_mask_, MLUOP_LAYOUT_NHWC);
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(rinput_);
......@@ -378,19 +156,20 @@ void CARAFEBackwardMLUKernelLauncher(
auto grad_mask_impl = torch_mlu::getMluTensorImpl(rgrad_mask_);
auto grad_mask_ptr = grad_mask_impl->cnnlMalloc();
// get dtype of grad_output
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(grad_output.dtype());
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
CNLOG(INFO) << "Launch Kernel KernelCarafeBackward<<<Union"
<< k_type / core_dim << ", " << k_dim.x << ", " << k_dim.y << ", "
<< k_dim.z << ">>>";
// set op descriptor
auto handle = mluOpGetCurrentHandle();
mluOpCarafeDescriptor_t carafe_desc;
TORCH_MLUOP_CHECK(mluOpCreateCarafeDescriptor(&carafe_desc));
TORCH_MLUOP_CHECK(mluOpSetCarafeDescriptor(
carafe_desc, grad_output.dim(), kernel_size, group_size, scale_factor));
// launch kernel
KernelCarafeBackward(k_dim, k_type, queue, d_type, input_ptr, mask_ptr,
grad_output_ptr, grad_input_ptr, grad_mask_ptr,
batch_size, hi, wi, channels, kernel_size, group_size,
scale_factor);
TORCH_MLUOP_CHECK(mluOpCarafeBackward(
handle, carafe_desc, input_desc.desc(), input_ptr, mask_desc.desc(),
mask_ptr, grad_output_desc.desc(), grad_output_ptr,
grad_input_desc.desc(), grad_input_ptr, grad_mask_desc.desc(),
grad_mask_ptr));
// destroy op descriptor
TORCH_MLUOP_CHECK(mluOpDestroyCarafeDescriptor(carafe_desc));
// copy output from NHWC back into NCHW
grad_input.copy_(rgrad_input_);
......
......@@ -9,254 +9,59 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelDeformRoIPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t data_type,
const void *input, const void *rois,
const void *offset, void *output,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
const float spatial_scale,
const int sampling_ratio, const float gamma);
void KernelDeformRoIPoolBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
cnrtDataType_t data_type, const void *grad_output, const void *input,
const void *rois, const void *offset, void *grad_input, void *grad_offset,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const float spatial_scale,
const int sampling_ratio, const float gamma);
// policy function for forward and backward
static void policyFunc(const int bin_num, cnrtDim3_t *k_dim,
cnrtFunctionType_t *k_type) {
const size_t cluster_limit = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
const size_t core_limit = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
const size_t bin_num_align = CEIL_ALIGN(bin_num, core_limit);
k_dim->x = core_limit;
k_dim->y = (bin_num_align / core_limit) > cluster_limit
? cluster_limit
: (bin_num_align / core_limit);
k_dim->z = 1;
*k_type = CNRT_FUNC_TYPE_UNION1;
}
#include "mlu_common_helper.h"
void DeformRoIPoolForwardMLUKernelLauncher(Tensor input, Tensor rois,
Tensor offset, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma) {
// Check dtype.
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"input type should be Float or Half, got ", input.scalar_type());
TORCH_CHECK(input.scalar_type() == rois.scalar_type(),
"rois should have the same type as input");
// Check shape.
TORCH_CHECK(input.dim() == 4, "input should be 4d tensor, got ", input.dim(),
"D.");
TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
"D.");
if (offset.defined() && offset.numel() > 0) {
TORCH_CHECK(input.scalar_type() == offset.scalar_type(),
"offset should have the same type as input");
TORCH_CHECK(offset.dim() == 4, "offset should be 4d tensor, got ",
offset.dim(), "D.");
TORCH_CHECK(
(offset.size(0) == rois.size(0)), "offset.size(0) = ", offset.size(0),
"while rois.size(0)) = ", rois.size(0), ". They should be the same.");
TORCH_CHECK((offset.size(1) == 2), "offset.size(1) should be 2, ",
"but now offset.size(1) = ", offset.size(1), ".");
TORCH_CHECK((offset.size(2) == output.size(2)),
"offset.size(2) = ", offset.size(2),
"while output.size(2)) = ", output.size(2),
". They should be the same.");
TORCH_CHECK((offset.size(3) == output.size(3)),
"offset.size(3) = ", offset.size(3),
"while output.size(3)) = ", output.size(3),
". They should be the same.");
}
TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
"spatial_scale should be within (0, 1], got ", spatial_scale,
".");
// compute kernel params
auto height = input.size(2);
auto width = input.size(3);
auto channels = input.size(1);
auto num_rois = output.size(0);
if (output.numel() == 0) {
output = at::zeros({num_rois, channels, pooled_height, pooled_width},
input.options());
return;
}
// zero element check
TORCH_CHECK(input.size(0) != 0, "input.size(0) should not be zero, got ",
input.size(0));
TORCH_CHECK(rois.numel() != 0, "rois.numel() should not be zero, got ",
rois.numel());
if (input.numel() == 0 || output.numel() == 0) {
return;
}
// large tensor check
const size_t max_input_num = 2147483648; // 2^31, 2G num
TORCH_CHECK(input.numel() < max_input_num,
"input.numel() should be less than 2147483648, got ",
input.numel());
TORCH_CHECK(rois.numel() < max_input_num,
"rois.numel() should be less than 2147483648, got ",
rois.numel());
TORCH_CHECK(output.numel() < max_input_num,
"output.numel() should be less than 2147483648, got ",
output.numel());
TORCH_CHECK(!offset.defined() || offset.numel() < max_input_num,
"offset.numel() should be less than 2147483648, got ",
offset.numel());
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
at::Tensor output_ =
at::empty({num_rois, channels, pooled_height, pooled_width},
input.options(), memory_format);
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(num_rois * pooled_height * pooled_width, &k_dim, &k_type);
// get compute queue
auto queue = torch_mlu::getCurQueue();
auto rois_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
auto output_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format);
MluOpTensorDescriptor input_desc, rois_desc, offset_desc, output_desc;
input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);
rois_desc.set(rois_contiguous);
output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);
mluOpTensorDescriptor_t offset_real_desc = NULL;
void *offset_ptr = NULL;
if (offset.defined() && offset.numel() > 0) {
auto offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
offset, offset.suggest_memory_format());
offset_desc.set(offset_contiguous);
offset_real_desc = offset_desc.desc();
auto offset_impl = torch_mlu::getMluTensorImpl(offset_contiguous);
offset_ptr = offset_impl->cnnlMalloc();
}
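// When offset is undefined or empty, offset_real_desc and offset_ptr stay
// NULL, which mluOpDeformRoiPoolForward presumably interprets as pooling
// without a deformable offset.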
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input_);
auto input_ptr = input_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
auto rois_ptr = rois_impl->cnnlMalloc();
auto offset_impl = torch_mlu::getMluTensorImpl(offset);
auto offset_ptr = offset_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output_);
auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
auto output_ptr = output_impl->cnnlMalloc();
// get compute dtype of input
cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input_.dtype());
// get compute handle
auto handle = mluOpGetCurrentHandle();
TORCH_MLUOP_CHECK(mluOpDeformRoiPoolForward(
handle, input_desc.desc(), input_ptr, rois_desc.desc(), rois_ptr,
offset_real_desc, offset_ptr, pooled_height, pooled_width, spatial_scale,
sampling_ratio, gamma, output_desc.desc(), output_ptr));
// launch kernel
CNLOG(INFO) << "Launch Kernel MLUKernelDeformRoIPoolForward<<<" << k_dim.x
<< ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelDeformRoIPoolForward(k_dim, k_type, queue, data_type, input_ptr,
rois_ptr, offset_ptr, output_ptr, channels, height,
width, num_rois, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
output.copy_(output_);
output.copy_(output_contiguous);
}
void DeformRoIPoolBackwardMLUKernelLauncher(
Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float gamma) {
// Check dtype.
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"input type should be Float or Half, got ", input.scalar_type());
TORCH_CHECK(input.scalar_type() == grad_output.scalar_type(),
"grad_output should have the same type as input");
TORCH_CHECK(input.scalar_type() == rois.scalar_type(),
"rois should have the same type as input");
TORCH_CHECK(input.scalar_type() == grad_input.scalar_type(),
"grad_input should have the same type as input");
// Check shape.
TORCH_CHECK(grad_output.dim() == 4, "grad_output should be 4d tensor, got ",
grad_output.dim(), "D.");
TORCH_CHECK(input.dim() == 4, "input should be 4d tensor, got ", input.dim(),
"D.");
TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
"D.");
if (offset.defined() && offset.numel() > 0) {
TORCH_CHECK(input.scalar_type() == offset.scalar_type(),
"offset should have the same type as input");
TORCH_CHECK(offset.dim() == 4, "offset should be 4d tensor, got ",
offset.dim(), "D.");
TORCH_CHECK(
(offset.size(0) == rois.size(0)), "offset.size(0) = ", offset.size(0),
"while rois.size(0)) = ", rois.size(0), ". They should be the same.");
TORCH_CHECK((offset.size(1) == 2), "offset.size(1) should be 2, ",
"but now offset.size(1) = ", offset.size(1), ".");
TORCH_CHECK((offset.size(2) == grad_output.size(2)),
"offset.size(2) = ", offset.size(2),
"while grad_output.size(2)) = ", grad_output.size(2),
". They should be the same.");
TORCH_CHECK((offset.size(3) == grad_output.size(3)),
"offset.size(3) = ", offset.size(3),
"while grad_output.size(3)) = ", grad_output.size(3),
". They should be the same.");
}
TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
"spatial_scale should be within (0, 1], got ", spatial_scale);
// Check relationship between tensor.
TORCH_CHECK((grad_output.size(0) == rois.size(0)),
"grad_output.size(0) = ", grad_output.size(0),
"while rois.size(0)) = ", rois.size(0),
". They should be the same.");
TORCH_CHECK((grad_output.size(1) == input.size(1)),
"grad_output.size(1) = ", grad_output.size(1),
"while input.size(1)) = ", input.size(1),
". They should be the same.");
TORCH_CHECK((grad_output.size(2) == pooled_height),
"grad_output.size(2) = ", grad_output.size(2),
"while pooled_height = ", pooled_height,
". They should be the same.");
TORCH_CHECK((grad_output.size(3) == pooled_width),
"grad_output.size(3) = ", grad_output.size(3),
"while pooled_width = ", pooled_width,
". They should be the same.");
// compute kernel params
auto batch = input.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
auto num_rois = grad_output.size(0);
// zero element check
TORCH_CHECK(input.size(0) != 0, "input.size(0) should not be zero, got ",
input.size(0));
TORCH_CHECK(rois.numel() != 0, "rois.numel() should not be zero, got ",
rois.numel());
if (input.numel() == 0 || grad_output.numel() == 0) {
return;
}
// large tensor check
const size_t max_input_num = 2147483648; // 2^31, 2G num
TORCH_CHECK(input.numel() < max_input_num,
"input.numel() should be less than 2147483648, got ",
input.numel());
TORCH_CHECK(rois.numel() < max_input_num,
"rois.numel() should be less than 2147483648, got ",
rois.numel());
TORCH_CHECK(grad_output.numel() < max_input_num,
"grad_output.numel() should be less than 2147483648, got ",
grad_output.numel());
TORCH_CHECK(!offset.defined() || offset.numel() < max_input_num,
"offset.numel() should be less than 2147483648, got ",
offset.numel());
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());
auto grad_output_ =
......@@ -264,45 +69,56 @@ void DeformRoIPoolBackwardMLUKernelLauncher(
memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
at::Tensor grad_input_ = at::empty({batch, channels, height, width},
input.options(), memory_format)
.zero_();
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(num_rois * pooled_height * pooled_width, &k_dim, &k_type);
// get compute queue
auto queue = torch_mlu::getCurQueue();
auto rois_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
auto grad_input_ =
torch_mlu::cnnl::ops::cnnl_contiguous(grad_input, memory_format);
// get ptr of tensors
auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output_);
auto grad_output_ptr = grad_output_impl->cnnlMalloc();
auto input_impl = torch_mlu::getMluTensorImpl(input_);
auto input_ptr = input_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
auto rois_ptr = rois_impl->cnnlMalloc();
auto offset_impl = torch_mlu::getMluTensorImpl(offset);
auto offset_ptr = offset_impl->cnnlMalloc();
auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
auto grad_input_ptr = grad_input_impl->cnnlMalloc();
auto grad_offset_impl = torch_mlu::getMluTensorImpl(grad_offset);
auto grad_offset_ptr = grad_offset_impl->cnnlMalloc();
// get compute dtype of input
cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input.dtype());
// launch kernel
CNLOG(INFO) << "Launch Kernel KernelDeformRoIPoolBackward<<<" << k_dim.x
<< ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelDeformRoIPoolBackward(k_dim, k_type, queue, data_type, grad_output_ptr,
input_ptr, rois_ptr, offset_ptr, grad_input_ptr,
grad_offset_ptr, channels, height, width,
num_rois, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
MluOpTensorDescriptor grad_output_desc, input_desc, rois_desc, offset_desc,
grad_input_desc, grad_offset_desc;
grad_output_desc.set_with_layout(grad_output_, MLUOP_LAYOUT_NHWC);
input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);
rois_desc.set(rois_contiguous);
grad_input_desc.set_with_layout(grad_input_, MLUOP_LAYOUT_NHWC);
mluOpTensorDescriptor_t offset_real_desc = NULL;
void *offset_ptr = NULL;
if (offset.defined() && offset.numel() > 0) {
auto offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
offset, offset.suggest_memory_format());
offset_desc.set(offset_contiguous);
offset_real_desc = offset_desc.desc();
auto offset_impl = torch_mlu::getMluTensorImpl(offset_contiguous);
offset_ptr = offset_impl->cnnlMalloc();
}
mluOpTensorDescriptor_t grad_offset_real_desc = NULL;
void *grad_offset_ptr = NULL;
if (grad_offset.defined() && grad_offset.numel() > 0) {
auto grad_offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
grad_offset, grad_offset.suggest_memory_format());
grad_offset_desc.set(grad_offset_contiguous);
grad_offset_real_desc = grad_offset_desc.desc();
auto grad_offset_impl = torch_mlu::getMluTensorImpl(grad_offset_contiguous);
grad_offset_ptr = grad_offset_impl->cnnlMalloc();
}
// get compute handle
auto handle = mluOpGetCurrentHandle();
TORCH_MLUOP_CHECK(mluOpDeformRoiPoolBackward(
handle, grad_output_desc.desc(), grad_output_ptr, input_desc.desc(),
input_ptr, rois_desc.desc(), rois_ptr, offset_real_desc, offset_ptr,
pooled_height, pooled_width, spatial_scale, sampling_ratio, gamma,
grad_input_desc.desc(), grad_input_ptr, grad_offset_real_desc,
grad_offset_ptr));
grad_input.copy_(grad_input_);
}
......
/*************************************************************************
* Copyright (C) 2023 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "mlu_common_helper.h"
Tensor diff_iou_rotated_sort_vertices_forward_mlu(Tensor vertices, Tensor mask,
Tensor num_valid) {
// params check
TORCH_CHECK(vertices.scalar_type() == at::kFloat,
"vertices type should be Float, got ", vertices.scalar_type());
TORCH_CHECK(mask.scalar_type() == at::kBool, "mask should be Bool, got ",
mask.scalar_type());
TORCH_CHECK(num_valid.scalar_type() == at::kInt,
"num_valid type should be Int32, got ", num_valid.scalar_type());
TORCH_CHECK(vertices.size(2) == 24, "vertices.dim(2) should be 24, got ",
vertices.size(2));
TORCH_CHECK(mask.size(2) == 24, "mask.dim(2) should be 24, got ",
mask.size(2));
// zero-element check
if (vertices.numel() == 0) {
return at::empty({0}, num_valid.options().dtype(at::kInt));
}
auto idx = at::empty({vertices.size(0), vertices.size(1), 9},
num_valid.options().dtype(at::kInt));
INITIAL_MLU_PARAM_WITH_TENSOR(vertices);
INITIAL_MLU_PARAM_WITH_TENSOR(mask);
INITIAL_MLU_PARAM_WITH_TENSOR(num_valid);
INITIAL_MLU_PARAM_WITH_TENSOR(idx);
// get compute handle
auto handle = mluOpGetCurrentHandle();
// launch kernel
TORCH_MLUOP_CHECK(mluOpDiffIouRotatedSortVerticesForward(
handle, vertices_desc.desc(), vertices_ptr, mask_desc.desc(), mask_ptr,
num_valid_desc.desc(), num_valid_ptr, idx_desc.desc(), idx_ptr));
return idx;
}
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
Tensor num_valid);
REGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, MLU,
diff_iou_rotated_sort_vertices_forward_mlu);
......@@ -12,87 +12,11 @@
#include <string>
#include <vector>
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
#include "mlu_common_helper.h"
void KernelFocalLossSigmoidForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const int32_t N,
const int32_t C, const float alpha,
const float gamma, void *output);
void KernelFocalLossSigmoidBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const float gamma,
const float alpha, const int32_t dim_n,
const int32_t deal_n, const int32_t dim_c,
void *output);
// Policy Function for Forward
static void policyFuncForward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
const Tensor &input, const Tensor &target,
const Tensor &weight) {
auto N = input.size(0);
auto C = input.size(1);
const size_t nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
const size_t c_align_size = PAD_UP((C * input.itemsize()), NFU_ALIGN_SIZE);
const int split_target_num = 2;
const int split_pipeline_num = 6;
const int has_weight = weight.data_ptr() != nullptr;
const int target_data_width = target.scalar_type() == at::kLong
? target.itemsize() / 2
: target.itemsize();
const int threshold_c =
PAD_DOWN((nram_size - split_target_num * sizeof(int)) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
input.itemsize();
int n_seg = 1;
if (C <= threshold_c) {
int c_size = C * input.itemsize();
int reservered_align_size =
(split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE;
int wegiht_size = 0;
if (has_weight) {
c_size = c_align_size;
reservered_align_size = split_target_num * NFU_ALIGN_SIZE;
wegiht_size = c_align_size;
}
// n_seg * c_size * split_pipeline_num + n_seg * target.itemsize() *
// split_target_num
// + weight_size + reservered_align_size <= nram_size
n_seg = (nram_size - wegiht_size - reservered_align_size) /
(split_pipeline_num * c_size + split_target_num * sizeof(int32_t));
}
auto seg_num = n_seg == 0 ? N : (N + n_seg - 1) / n_seg;
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
auto core_num = core_dim * cluster_num;
k_dim->x = *k_type;
k_dim->y =
seg_num > core_num ? cluster_num : (seg_num + core_dim - 1) / core_dim;
k_dim->z = 1;
}
// Policy Function for Backward
static void policyFuncBackward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
// set Union1 Job
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
k_dim->y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
k_dim->z = 1;
}
void SigmoidFocalLossForwardMLUKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha) {
void sigmoid_focal_loss_forward_mlu(Tensor input, Tensor target, Tensor weight,
Tensor output, const float gamma,
const float alpha) {
// params check
TORCH_CHECK(gamma >= 0, "gamma should be greater than or equal to 0. ",
"But now gamma is ", gamma, ".");
......@@ -123,103 +47,50 @@ void SigmoidFocalLossForwardMLUKernelLauncher(Tensor input, Tensor target,
return;
}
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1;
policyFuncForward(&k_dim, &k_type, input, target, weight);
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// contiguous
auto input_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
input, input.suggest_memory_format());
// target only supports int32
auto target_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
target.toType(at::kInt), target.suggest_memory_format());
auto weight_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
weight, weight.suggest_memory_format());
auto output_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
output, output.suggest_memory_format());
// set tensor descriptor
MluOpTensorDescriptor input_desc, target_desc, weight_desc, output_desc;
input_desc.set(input_contiguous);
target_desc.set(target_contiguous);
weight_desc.set(weight_contiguous);
output_desc.set(output_contiguous);
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input);
auto input_impl = torch_mlu::getMluTensorImpl(input_contiguous);
auto input_ptr = input_impl->cnnlMalloc();
auto target_impl = torch_mlu::getMluTensorImpl(target);
auto target_impl = torch_mlu::getMluTensorImpl(target_contiguous);
auto target_ptr = target_impl->cnnlMalloc();
auto weight_impl = torch_mlu::getMluTensorImpl(weight);
auto weight_impl = torch_mlu::getMluTensorImpl(weight_contiguous);
auto weight_ptr = weight_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output);
auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
auto output_ptr = output_impl->cnnlMalloc();
// get dtype of input
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());
CNLOG(INFO) << "Launch Kernel KernelFocalLossSigmoidForward<<<Union"
<< k_type / core_dim << ", " << k_dim.x << ", " << k_dim.y << ", "
<< k_dim.z << ">>>";
// launch kernel
KernelFocalLossSigmoidForward(k_dim, k_type, queue, d_type, input_ptr,
target_ptr, weight_ptr, input.size(0),
input.size(1), alpha, gamma, output_ptr);
}
void getDealNAndThresholdC(const int compute_data_bytes,
const int target_data_bytes, const int total_c,
int *deal_n_ptr, int *threshold_c_ptr,
const bool has_weight, const bool is_half) {
/* NRAM partition:
*
* |-----------------ping pong--------------------|
* |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight|
*
* split_pipeline_num is 5: including input, pt, alpha_t, temp, output.
*/
const int nram_split_num = 5;
const int nram_split_pingpong = 2;
const int max_nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
int32_t compute_align_size = NFU_ALIGN_SIZE;
if (is_half) {
compute_align_size += NFU_ALIGN_SIZE;
}
const int32_t compute_align_num = compute_align_size / compute_data_bytes;
// reservered_align_size: including input(ping pong), pt(ping pong),
// alpha_t(ping pong), temp(ping pong),
// output(ping pong), target(ping pong),
// flt_min and gamma.
const int reservered_align_size =
((nram_split_num + 1) * nram_split_pingpong + 2) * compute_align_size;
int nram_pingpong_size = max_nram_size - reservered_align_size;
// set prefer computation performance and reduction approach
mluOpComputationPreference_t prefer = MLUOP_COMPUTATION_FAST;
mluOpLossReduction_t reduction = MLUOP_LOSS_REDUCTION_NONE;
int compute_c = total_c;
int threshold_c = 0;
if (has_weight) {
// reserved space for weight to align
nram_pingpong_size -= NFU_ALIGN_SIZE;
auto handle = mluOpGetCurrentHandle();
// threshold_c * nram_split_pingpong * compute_data_bytes * nram_split_num +
// nram_split_pingpong * target_data_bytes +
// threshold_c * compute_data_bytes <= nram_pingpong_size
threshold_c =
(nram_pingpong_size - nram_split_pingpong * target_data_bytes) /
(compute_data_bytes * (nram_split_num * nram_split_pingpong + 1));
threshold_c = PAD_DOWN(threshold_c, compute_align_num);
int weight_space = PAD_UP(total_c * compute_data_bytes, NFU_ALIGN_SIZE);
// reserved space for weight
nram_pingpong_size -= weight_space;
compute_c = PAD_UP(total_c, compute_align_num);
} else {
// threshold_c * nram_split_pingpong * compute_data_bytes * nram_split_num +
// nram_split_pingpong * target_data_bytes <= nram_pingpong_size
threshold_c =
(nram_pingpong_size / nram_split_pingpong - target_data_bytes) /
(nram_split_num * compute_data_bytes);
}
// deal_n * compute_c * nram_split_pingpong * compute_data_bytes *
// nram_split_num + deal_n * nram_split_pingpong * target_data_bytes <=
// nram_pingpong_size
*deal_n_ptr =
nram_pingpong_size /
((nram_split_num * compute_c * compute_data_bytes + target_data_bytes) *
nram_split_pingpong);
*threshold_c_ptr = threshold_c;
// launch kernel
TORCH_MLUOP_CHECK(mluOpFocalLossSigmoidForward(
handle, prefer, reduction, input_desc.desc(), input_ptr,
target_desc.desc(), target_ptr, weight_desc.desc(), weight_ptr, alpha,
gamma, output_desc.desc(), output_ptr));
}
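// Illustrative worked example of the NRAM partition arithmetic above (not part of
// the launch path). Assuming a hypothetical 512 KB NRAM per core, float compute
// (4 bytes), int32 target (4 bytes), no weight and NFU_ALIGN_SIZE = 128; all
// numbers are assumptions for illustration only:
//   reserved            = ((5 + 1) * 2 + 2) * 128            = 1792 bytes
//   nram_pingpong_size   = 512 * 1024 - 1792                  = 522496 bytes
//   threshold_c          = (522496 / 2 - 4) / (5 * 4)         = 13062 (max supported C)
//   deal_n (for C = 80)  = 522496 / ((5 * 80 * 4 + 4) * 2)    = 162 rows per stage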
void SigmoidFocalLossBackwardMLUKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha) {
void sigmoid_focal_loss_backward_mlu(Tensor input, Tensor target, Tensor weight,
Tensor output, const float gamma,
const float alpha) {
// params check
TORCH_CHECK(gamma >= 0, "gamma should be greater than or equal to 0. ",
"But now gamma is ", gamma, ".");
......@@ -246,77 +117,51 @@ void SigmoidFocalLossBackwardMLUKernelLauncher(Tensor input, Tensor target,
CNLOG(INFO) << "weight is a empty tensor.";
}
auto dim_c = input.size(1);
const int compute_data_bytes = sizeof(float);
// target only supports INT on the MLU device while it stays LONG on the
// host side, so use target.itemsize() / 2 (8-byte Long -> 4-byte Int)
const int target_data_bytes = target.scalar_type() == at::kLong
? (target.itemsize() / 2)
: target.itemsize();
int deal_n = 0;
int threshold_c = 0;
bool is_half = false;
if (input.scalar_type() == at::kHalf) {
is_half = true;
}
// calculate deal_n and threshold_c
getDealNAndThresholdC(compute_data_bytes, target_data_bytes, dim_c, &deal_n,
&threshold_c, has_weight, is_half);
// check C
TORCH_CHECK(threshold_c >= dim_c,
"input.size(1) should be in the range of [0, ", threshold_c,
"]. ", "But now input.size(1) is ", dim_c, ".");
if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {
// return if zero-element
return;
}
// set task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFuncBackward(&k_dim, &k_type);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// contiguous
auto input_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
input, input.suggest_memory_format());
// target only supports int32
auto target_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
target.toType(at::kInt), target.suggest_memory_format());
auto weight_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
weight, weight.suggest_memory_format());
auto output_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
output, output.suggest_memory_format());
// set tensor descriptor
MluOpTensorDescriptor input_desc, target_desc, weight_desc, output_desc;
input_desc.set(input_contiguous);
target_desc.set(target_contiguous);
weight_desc.set(weight_contiguous);
output_desc.set(output_contiguous);
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input);
auto input_impl = torch_mlu::getMluTensorImpl(input_contiguous);
auto input_ptr = input_impl->cnnlMalloc();
auto target_impl = torch_mlu::getMluTensorImpl(target);
auto target_impl = torch_mlu::getMluTensorImpl(target_contiguous);
auto target_ptr = target_impl->cnnlMalloc();
auto weight_impl = torch_mlu::getMluTensorImpl(weight);
auto weight_impl = torch_mlu::getMluTensorImpl(weight_contiguous);
auto weight_ptr = weight_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output);
auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
auto output_ptr = output_impl->cnnlMalloc();
// get dtype of input
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto dim_n = input.size(0);
// set the preferred computation performance and reduction approach
// backward only supports MLUOP_COMPUTATION_HIGH_PRECISION
mluOpComputationPreference_t prefer = MLUOP_COMPUTATION_HIGH_PRECISION;
mluOpLossReduction_t reduction = MLUOP_LOSS_REDUCTION_NONE;
CNLOG(INFO) << "Launch Kernel KernelFocalLossSigmoidBackward<<<Union"
<< k_type / core_dim << ", " << k_dim.x << ", " << k_dim.y << ", "
<< k_dim.z << ">>>";
auto handle = mluOpGetCurrentHandle();
// launch kernel
KernelFocalLossSigmoidBackward(k_dim, k_type, queue, d_type, input_ptr,
target_ptr, weight_ptr, gamma, alpha, dim_n,
deal_n, dim_c, output_ptr);
}
void sigmoid_focal_loss_forward_mlu(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
SigmoidFocalLossForwardMLUKernelLauncher(input, target, weight, output, gamma,
alpha);
}
void sigmoid_focal_loss_backward_mlu(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma,
float alpha) {
SigmoidFocalLossBackwardMLUKernelLauncher(input, target, weight, grad_input,
gamma, alpha);
TORCH_MLUOP_CHECK(mluOpFocalLossSigmoidBackward(
handle, prefer, reduction, input_desc.desc(), input_ptr,
target_desc.desc(), target_ptr, weight_desc.desc(), weight_ptr, alpha,
gamma, output_desc.desc(), output_ptr));
}
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
......
......@@ -10,114 +10,31 @@
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_dram,
const int input_box_num, const float iou_threshold,
void *workspace, void *output_size, void *output);
int selectType(uint32_t use_job, int box_num_per_core) {
// the box_num_per_core should be at least 256, otherwise the real IO
// bandwidth would be very low
while (box_num_per_core < 256 && use_job >= 4) {
box_num_per_core *= 2;
use_job /= 2;
}
return use_job;
}
static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
int &core_num_per_class,
const int input_box_num) {
uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t job_limit = getJobLimitCapability();
uint32_t core_number = job_limit;
int box_num_per_core = (input_box_num + core_number - 1) / core_number;
int use_job = selectType(job_limit, box_num_per_core);
// initialize k_type as Union1
k_dim->x = core_dim;
k_dim->y = 1;
k_dim->z = 1;
*k_type = CNRT_FUNC_TYPE_UNION1;
switch (job_limit) {
case CN_KERNEL_CLASS_BLOCK:
case CN_KERNEL_CLASS_UNION:
case CN_KERNEL_CLASS_UNION2:
case CN_KERNEL_CLASS_UNION4:
case CN_KERNEL_CLASS_UNION8:
case CN_KERNEL_CLASS_UNION16: {
if (use_job < 4) {
k_dim->x = 1;
*k_type = CNRT_FUNC_TYPE_BLOCK;
} else if (use_job == 4) {
k_dim->x = core_dim;
*k_type = CNRT_FUNC_TYPE_UNION1;
} else {
k_dim->x = use_job;
*k_type = (cnrtFunctionType_t)use_job;
}
}; break;
default:
LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number."
<< " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task.";
}
return CNNL_STATUS_SUCCESS;
}
#include "mlu_common_helper.h"
void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
float iou_threshold) {
// dimension parameters check
TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
boxes.dim(), "D");
TORCH_CHECK(boxes.size(1) == 7,
"boxes should have 7 elements in dimension 1, got ",
boxes.size(1));
// data type check
TORCH_CHECK(
boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
"data type of boxes should be Float or Half, got ", boxes.scalar_type());
if (boxes.numel() == 0) {
return;
}
const size_t max_input_num = 2147483648; // 2^31, 2G num
TORCH_CHECK(boxes.numel() < max_input_num,
"boxes.numel() should be less than 2147483648, got ",
boxes.numel());
int input_box_num = boxes.size(0);
cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
cnrtDim3_t k_dim;
cnrtJobType_t k_type;
int core_num_per_class;
policyFunc(&k_dim, &k_type, core_num_per_class, input_box_num);
// transpose boxes (n, 7) to (7, n) for better performance
auto boxes_t = boxes.transpose(0, 1);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);
auto output = at::empty({input_box_num}, boxes.options().dtype(at::kLong));
int input_box_num = boxes.size(0);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
auto output = keep.to(boxes.options().dtype(at::kInt));
auto output_size = at::empty({1}, boxes.options().dtype(at::kInt));
// workspace
const int info_num = 7;  // x, y, z, dx, dy, dz, angle
size_t space_size = 0;
if (boxes.scalar_type() == at::kHalf) {
space_size = input_box_num * sizeof(int16_t) * info_num +
input_box_num * sizeof(float) + sizeof(float);
} else {
space_size = input_box_num * sizeof(float) * (info_num + 1) + sizeof(float);
}
MluOpTensorDescriptor boxes_desc, output_desc;
boxes_desc.set(boxes_);
output_desc.set(output);
auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
// workspace
size_t workspace_size = 0;
auto handle = mluOpGetCurrentHandle();
TORCH_MLUOP_CHECK(mluOpGetNmsWorkspaceSize(handle, boxes_desc.desc(), NULL,
&workspace_size));
auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));
// get compute queue
auto queue = torch_mlu::getCurQueue();
auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
auto boxes_ptr = boxes_impl->cnnlMalloc();
auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
......@@ -127,11 +44,29 @@ void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
auto output_size_impl = torch_mlu::getMluTensorImpl(keep_num);
auto output_size_ptr = output_size_impl->cnnlMalloc();
uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
CNLOG(INFO) << "Launch Kernel KernelIou3d<<<Union" << k_type / core_dim
<< ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelIou3d(k_dim, k_type, queue, data_type_input, boxes_ptr, input_box_num,
iou_threshold, workspace_ptr, output_size_ptr, output_ptr);
// nms desc
mluOpNmsDescriptor_t nms_desc;
const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;
const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;
const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;
const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;
const float soft_nms_sigma = 0.0;
const float confidence_threshold = 0.0;
const int input_layout = 0;
const bool pad_to_max_output_size = false;
const int max_output_size = input_box_num;
const float offset = 0.0;
TORCH_MLUOP_CHECK(mluOpCreateNmsDescriptor(&nms_desc));
TORCH_MLUOP_CHECK(mluOpSetNmsDescriptor(
nms_desc, box_mode, output_mode, algo, method_mode, iou_threshold,
soft_nms_sigma, max_output_size, confidence_threshold, offset,
input_layout, pad_to_max_output_size));
TORCH_MLUOP_CHECK(mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr,
NULL, NULL, workspace_ptr, workspace_size,
output_desc.desc(), output_ptr, output_size_ptr));
TORCH_MLUOP_CHECK(mluOpDestroyNmsDescriptor(nms_desc));
}
void iou3d_nms3d_forward_mlu(const Tensor boxes, Tensor &keep, Tensor &keep_num,
......
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "mlu_common_helper.h"
// Descriptors
mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type) {
const std::map<std::string, mluOpDataType_t> mapping_type = {
{std::string("c10::Half"), MLUOP_DTYPE_HALF},
{std::string("float"), MLUOP_DTYPE_FLOAT},
{std::string("double"), MLUOP_DTYPE_DOUBLE},
{std::string("int8"), MLUOP_DTYPE_INT8},
{std::string("signed char"), MLUOP_DTYPE_INT8},
{std::string("short int"), MLUOP_DTYPE_INT16},
{std::string("short"), MLUOP_DTYPE_INT16},
{std::string("int"), MLUOP_DTYPE_INT32},
{std::string("long int"), MLUOP_DTYPE_INT64},
{std::string("long"), MLUOP_DTYPE_INT64},
{std::string("unsigned char"), MLUOP_DTYPE_UINT8},
{std::string("bool"), MLUOP_DTYPE_BOOL},
{std::string("c10::complex<c10::Half>"), MLUOP_DTYPE_COMPLEX_HALF},
{std::string("c10::complex<float>"), MLUOP_DTYPE_COMPLEX_FLOAT}};
if (mapping_type.find(std::string(data_type.name())) != mapping_type.end()) {
return mapping_type.find(std::string(data_type.name()))->second;
}
return MLUOP_DTYPE_INVALID;
}
// layout
mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input) {
auto suggest_memory_format = input.suggest_memory_format();
mluOpTensorLayout_t layout = MLUOP_LAYOUT_ARRAY;
switch (input.dim()) {
case 4:
layout = (suggest_memory_format == at::MemoryFormat::ChannelsLast)
? MLUOP_LAYOUT_NHWC
: MLUOP_LAYOUT_NCHW;
break;
case 5:
layout = (suggest_memory_format == at::MemoryFormat::ChannelsLast3d)
? MLUOP_LAYOUT_NDHWC
: MLUOP_LAYOUT_NCDHW;
break;
default:
layout = MLUOP_LAYOUT_ARRAY;
}
return layout;
}
mluOpReduceMode_t getMluOpReduceMode(const reduce_t reduce_type) {
const std::map<reduce_t, mluOpReduceMode_t> mapping_type = {
{reduce_t::MAX, MLUOP_REDUCE_DMAX},
{reduce_t::SUM, MLUOP_REDUCE_DSUM},
{reduce_t::MEAN, MLUOP_REDUCE_DMEAN}};
if (mapping_type.find(reduce_type) != mapping_type.end()) {
return mapping_type.find(reduce_type)->second;
} else {
TORCH_CHECK(false, "Unsupported reduce type: ", to_string(reduce_type));
return MLUOP_REDUCE_DSUM;
}
}
void MluOpTensorDescriptor::set(Tensor t) {
mluOpDataType_t data_type = getMluOpDataType(t.dtype());
mluOpTensorLayout_t layout = getMluOpSuggestLayout(t);
int t_dim = t.dim();
std::vector<int> dim_array;
if (t_dim == 0) {
dim_array.push_back(
1);  // a scalar tensor (0-dim, 1-item tensor) is viewed as having size 1 by default
} else {
for (int i = 0; i < t_dim; i++) {
dim_array.push_back(static_cast<int>(t.sizes().vec()[i]));
}
}
set_desc(t, layout, data_type, dim_array);
}
void MluOpTensorDescriptor::set_with_layout(Tensor t,
mluOpTensorLayout_t layout) {
mluOpDataType_t data_type = getMluOpDataType(t.dtype());
int t_dim = t.dim();
std::vector<int> shape_info = checkUpperBoundAndCastTo<int>(t.sizes().vec());
std::vector<int> stride_info =
checkUpperBoundAndCastTo<int>(t.strides().vec());
if (layout == MLUOP_LAYOUT_NHWC || layout == MLUOP_LAYOUT_NDHWC ||
layout == MLUOP_LAYOUT_NLC) {
convertShapeAndStride(shape_info, stride_info);
} else if (layout == MLUOP_LAYOUT_HWCN) {
auto convertDepthWiseConvShapeStride = [](const std::vector<int64_t>& vec,
std::vector<int>& target_vec,
std::vector<int>& stride_vec) {
// NCHW --> HWCN
target_vec[0] = static_cast<int>(vec[2]);
target_vec[1] = static_cast<int>(vec[3]);
target_vec[2] = static_cast<int>(vec[1]);
target_vec[3] = static_cast<int>(vec[0]);
// Calculate strides as if the tensor were contiguous in HWCN order.
stride_vec[3] = 1;
stride_vec[2] = target_vec[3] * stride_vec[3];
stride_vec[1] = target_vec[2] * stride_vec[2];
stride_vec[0] = target_vec[1] * stride_vec[1];
};
convertDepthWiseConvShapeStride(t.sizes().vec(), shape_info, stride_info);
}
TORCH_CHECK(mluOpSetTensorDescriptorEx(
desc_, layout, data_type, t_dim, shape_info.data(),
stride_info.data()) == MLUOP_STATUS_SUCCESS,
"mluOpSetTensorDescriptorEx execution failed.");
}
void MluOpTensorDescriptor::set_desc(const at::Tensor& t,
mluOpTensorLayout_t layout,
mluOpDataType_t dtype,
std::vector<int>& dims) {
int dimNb = dims.size();
TORCH_MLUOP_CHECK(
mluOpSetTensorDescriptor(desc_, layout, dtype, dimNb, dims.data()));
}
// Handles
std::once_flag mmcv_mluop_init_flag;
std::mutex mmcv_mluop_mutex;
static std::vector<MluOpHandle> mmcv_mluop_handles;
mluOpHandle_t mluOpGetCurrentHandle(c10::DeviceIndex device_index) {
std::call_once(mmcv_mluop_init_flag,
[]() // Init mmcv_mluop_handles 1-device <-> 1-handle
{
c10::DeviceIndex num_devices = torch_mlu::device_count();
mmcv_mluop_handles.resize(num_devices);
});
if (device_index == -1) {
device_index = torch_mlu::current_device();
}
std::lock_guard<std::mutex> mmcv_mluop_guard(mmcv_mluop_mutex);
auto queue = torch_mlu::getCurrentQueue(device_index).queue();
mmcv_mluop_handles[device_index].setQueue(queue);
return mmcv_mluop_handles[device_index].handle;
}
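// Usage sketch (illustrative): the call pattern used by the launchers in this
// commit -- fetch the per-device handle (bound to the current queue) once per
// launcher and pass it to every subsequent mluOp* call.
//   auto handle = mluOpGetCurrentHandle();  // defaults to the current device
//   // ... set tensor descriptors, then e.g.
//   // TORCH_MLUOP_CHECK(mluOpNmsRotated(handle, ...));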
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#pragma once
#include <ATen/ATen.h>
#include <c10/core/ScalarType.h>
#include "aten.h"
#include "mlu_op.h"
#include "pytorch_device_registry.hpp"
#define MLUOP_MAJOR 0
#define MLUOP_MINOR 8
#define MLUOP_PATCHLEVEL 1
/*************************************************************************
* This macro maps a plain torch tensor to its MLU counterparts:
* NAME##_contigous, NAME##_desc, NAME##_impl and NAME##_ptr are
* generated automatically.
*************************************************************************/
#define INITIAL_MLU_PARAM_WITH_TENSOR(NAME) \
auto NAME##_contigous = torch_mlu::cnnl::ops::cnnl_contiguous( \
NAME, NAME.suggest_memory_format()); \
MluOpTensorDescriptor NAME##_desc; \
NAME##_desc.set(NAME##_contigous); \
auto NAME##_impl = torch_mlu::getMluTensorImpl(NAME##_contigous); \
auto NAME##_ptr = NAME##_impl->cnnlMalloc();
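// For reference, INITIAL_MLU_PARAM_WITH_TENSOR(boxes) for a hypothetical tensor
// `boxes` expands (modulo whitespace) to:
//   auto boxes_contigous = torch_mlu::cnnl::ops::cnnl_contiguous(
//       boxes, boxes.suggest_memory_format());
//   MluOpTensorDescriptor boxes_desc;
//   boxes_desc.set(boxes_contigous);
//   auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_contigous);
//   auto boxes_ptr = boxes_impl->cnnlMalloc();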
#ifndef TORCH_MLUOP_CHECK
#define TORCH_MLUOP_CHECK(EXPR) \
do { \
mluOpStatus_t status = EXPR; \
if (status != MLUOP_STATUS_SUCCESS) { \
CNLOG(ERROR) << ""; \
TORCH_CHECK(false, "MLUOPS error: ", mluOpGetErrorString(status)); \
} \
} while (0);
#endif
enum class reduce_t { SUM = 0, MEAN = 1, MAX = 2 };
inline std::string to_string(reduce_t reduce_type) {
if (reduce_type == reduce_t::MAX) {
return "max";
} else if (reduce_type == reduce_t::MEAN) {
return "mean";
} else if (reduce_type == reduce_t::SUM) {
return "sum";
} else {
return "unknown reduce type";
}
}
mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);
mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);
mluOpReduceMode_t getMluOpReduceMode(const reduce_t reduce_type);
class MluOpTensorDescriptor {
public:
MluOpTensorDescriptor() {
TORCH_MLUOP_CHECK(mluOpCreateTensorDescriptor(&desc_));
};
~MluOpTensorDescriptor() {
TORCH_MLUOP_CHECK(mluOpDestroyTensorDescriptor(desc_));
}
void set(at::Tensor);
void set_with_layout(at::Tensor, mluOpTensorLayout_t layout);
mluOpTensorDescriptor_t desc() { return desc_; }
private:
mluOpTensorDescriptor_t desc_;
void set_desc(const at::Tensor&, mluOpTensorLayout_t, mluOpDataType_t,
std::vector<int>& dims);
};
mluOpHandle_t mluOpGetCurrentHandle(c10::DeviceIndex device_index = -1);
class MluOpHandle {
public:
MluOpHandle() : handle(nullptr) { TORCH_MLUOP_CHECK(mluOpCreate(&handle)); }
~MluOpHandle() {
if (handle) {
TORCH_MLUOP_CHECK(mluOpDestroy(handle));
handle = nullptr;
}
}
void setQueue(cnrtQueue_t queue) {
TORCH_MLUOP_CHECK(mluOpSetQueue(handle, queue));
}
mluOpHandle_t handle;
};
// Reorder tensor sizes and strides from channels_first order to
// channels_last or channels_last_3d order. This is not the same as the
// PyTorch logical layout: the reordered layout reflects the real storage
// order of the data.
// Example: convert a channels_last tensor's dims to an NHWC tensor desc.
// N C H W --> N H W C
// C*H*W 1 W C --> C*H*W W C 1
template <typename T>
void convertShapeAndStride(std::vector<T>& shape_info,
std::vector<T>& stride_info) {
TORCH_MLU_CHECK(shape_info.size() == stride_info.size(),
"shape size must equal stride size.");
const int dim = shape_info.size();
std::vector<T> temp_shape_info(dim);
std::vector<T> temp_stride_info(dim);
temp_shape_info[0] = shape_info[0];
temp_stride_info[0] = stride_info[0];
for (size_t i = 0; i < dim - 1; ++i) {
const int index = (i + 1) % (dim - 1) + 1;
temp_shape_info[i + 1] = shape_info[index];
temp_stride_info[i + 1] = stride_info[index];
}
shape_info.assign(temp_shape_info.begin(), temp_shape_info.end());
stride_info.assign(temp_stride_info.begin(), temp_stride_info.end());
}
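// Worked example (illustrative only): a channels_last tensor with logical NCHW
// sizes {2, 3, 4, 5} has strides {60, 1, 15, 3}; convertShapeAndStride permutes
// both vectors into the storage (NHWC) order:
//   std::vector<int> shape  = {2, 3, 4, 5};   // N, C, H, W
//   std::vector<int> stride = {60, 1, 15, 3}; // channels_last strides
//   convertShapeAndStride(shape, stride);
//   // shape  == {2, 4, 5, 3}   (N, H, W, C)
//   // stride == {60, 15, 3, 1} (contiguous in NHWC order)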
// Torch tensors provide int64_t shapes and strides, but the mluOps
// descriptor requires int32. Use this function to cast safely, or to
// report an error when a value would overflow.
template <typename DST_T, typename SRC_T>
std::vector<DST_T> checkUpperBoundAndCastTo(const std::vector<SRC_T>& input) {
std::vector<DST_T> output;
output.reserve(input.size());
for (const auto& val : input) {
if (val > std::numeric_limits<DST_T>::max()) {
TORCH_MLU_CHECK(false, "Requires dim size not greater than ",
std::numeric_limits<DST_T>::max(), ". But got ", val,
".");
}
output.push_back(static_cast<DST_T>(val));
}
return output;
}
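// Usage sketch (illustrative only): casting a tensor's int64_t sizes to the
// int32 vector expected by the mluOps descriptor. A dimension larger than
// INT32_MAX trips the TORCH_MLU_CHECK above instead of overflowing silently.
//   std::vector<int64_t> sizes = {64, 3, 224, 224};
//   std::vector<int> dims = checkUpperBoundAndCastTo<int>(sizes);  // {64, 3, 224, 224}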
......@@ -9,396 +9,104 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "mlu_common_helper.h"
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
void KernelMsDeformAttnForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const char* data_value_gdram,
const char* data_spatial_shapes_gdram,
const char* data_level_start_index_gdram,
const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram,
const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_queries,
const int32_t num_points, char* data_col_gdram);
void KernelMsDeformAttnBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const float* data_value,
const int32_t* spatial_shapes, const int32_t* data_level_start_index,
const float* data_sampling_loc, const float* data_attn_weight,
const float* grad_output, const int32_t batch_size, const int32_t num_keys,
const int32_t num_heads, const int32_t channels, const int32_t num_levels,
const int32_t num_queries, const int32_t num_points, float* grad_value,
float* grad_sampling_loc, float* grad_attn_weight);
// policy function
static void policyFuncForward(cnrtDim3_t* k_dim, cnrtFunctionType_t* k_type,
const int batch_size, const int num_queries,
const int num_heads) {
k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
k_dim->y =
MIN((batch_size * num_queries * num_heads + k_dim->x - 1) / k_dim->x,
torch_mlu::getDeviceAttr(cnrtAttrClusterCount));
k_dim->z = 1;
#if __BANG_ARCH__ == 520
*k_type = CNRT_FUNC_TYPE_BLOCK;
#else
*k_type = CNRT_FUNC_TYPE_UNION1;
#endif
}
// policy function for backward
static void policyFuncBackward(const int32_t batch_size,
const int32_t num_queries,
const int32_t num_heads,
const int32_t num_levels,
cnrtFunctionType_t* k_type, cnrtDim3_t* k_dim) {
size_t cluster_limit = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
size_t core_limit = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
k_dim->x = core_limit;
int32_t total_num = batch_size * num_queries * num_heads * num_levels;
size_t total_num_align = CEIL_ALIGN(total_num, core_limit);
k_dim->y = (total_num_align / core_limit) > cluster_limit
? cluster_limit
: (total_num_align / core_limit);
k_dim->z = 1;
*k_type = CNRT_FUNC_TYPE_UNION1;
}
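// Worked example (illustrative; assumes 4 cores per cluster and 8 clusters,
// which are hypothetical values):
//   batch_size = 2, num_queries = 100, num_heads = 8, num_levels = 4
//   total_num       = 2 * 100 * 8 * 4 = 6400
//   total_num_align = CEIL_ALIGN(6400, 4) = 6400
//   k_dim = {4, min(6400 / 4, 8), 1} = {4, 8, 1}, k_type = CNRT_FUNC_TYPE_UNION1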
Tensor ms_deform_attn_mlu_forward(const Tensor& value,
const Tensor& spatial_shapes,
const Tensor& level_start_index,
const Tensor& sampling_loc,
const Tensor& attn_weight,
const int im2col_step) {
// check contiguous
AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
AT_ASSERTM(spatial_shapes.is_contiguous(),
"spatial_shapes tensor has to be contiguous");
AT_ASSERTM(level_start_index.is_contiguous(),
"level_start_index tensor has to be contiguous");
AT_ASSERTM(sampling_loc.is_contiguous(),
"sampling_loc tensor has to be contiguous");
AT_ASSERTM(attn_weight.is_contiguous(),
"attn_weight tensor has to be contiguous");
// check datatype
TORCH_CHECK((value.scalar_type() == at::kFloat),
"value type should be Float, got ", value.scalar_type(), ".");
TORCH_CHECK((spatial_shapes.scalar_type() == at::kInt ||
spatial_shapes.scalar_type() == at::kLong),
"spatial_shapes type should be Int, got ",
spatial_shapes.scalar_type(), ".");
TORCH_CHECK((level_start_index.scalar_type() == at::kInt ||
level_start_index.scalar_type() == at::kLong),
"level_start_index type should be Int, got ",
level_start_index.scalar_type(), ".");
TORCH_CHECK((sampling_loc.scalar_type() == at::kFloat),
"sampling_loc type should be Float, got ",
sampling_loc.scalar_type(), ".");
TORCH_CHECK((attn_weight.scalar_type() == at::kFloat),
"attn_weight type should be Float, got ",
attn_weight.scalar_type(), ".");
// check shape
TORCH_CHECK(value.dim() == 4, "value should be a 4d tensor, got ",
value.dim(), "D.");
TORCH_CHECK(spatial_shapes.dim() == 2,
"spatial_shapes should be a 2d tensor, got ",
spatial_shapes.dim(), "D.");
TORCH_CHECK(level_start_index.dim() == 1,
"level_start_index should be a 1d tensor, got ",
level_start_index.dim(), "D.");
TORCH_CHECK(sampling_loc.dim() == 6,
"sampling_loc should be a 6d tensor, got ", sampling_loc.dim(),
"D.");
TORCH_CHECK(attn_weight.dim() == 5, "attn_weight should be a 5d tensor, got ",
attn_weight.dim(), "D.");
Tensor MsDeformAttnForwardLauncher(const Tensor& value,
const Tensor& spatial_shapes,
const Tensor& level_start_index,
const Tensor& sampling_loc,
const Tensor& attn_weight,
const int im2col_step) {
auto handle = mluOpGetCurrentHandle();
const int batch_size = value.size(0);
const int num_keys = value.size(1);
const int num_heads = value.size(2);
const int channels = value.size(3);
const int num_levels = spatial_shapes.size(0);
const int num_queries = sampling_loc.size(1);
const int num_points = sampling_loc.size(4);
TORCH_CHECK(spatial_shapes.size(1) == 2,
"the 2nd dimensions of spatial_shapes should be 2, got ",
spatial_shapes.size(1), ".");
TORCH_CHECK(sampling_loc.size(5) == 2,
"the 6th dimensions of sampling_loc should be 2, got ",
sampling_loc.size(5), ".");
TORCH_CHECK((sampling_loc.size(0) == batch_size),
"the 1st dimensions of sampling_loc should be batch_size, ",
"but now the 1st dimension of sampling_loc is ",
sampling_loc.size(0), ", and batch_size is ", batch_size, ".");
TORCH_CHECK((attn_weight.size(0) == batch_size),
"the 1st dimensions of attn_weight should be batch_size, ",
"but now the 1st dimension of attn_weight is ",
attn_weight.size(0), ", and batch_size is ", batch_size, ".");
TORCH_CHECK((sampling_loc.size(2) == num_heads),
"the 3rd dimensions of sampling_loc should be num_heads, ",
"but now the 3rd dimension of sampling_loc is ",
sampling_loc.size(2), ", and num_heads is ", num_heads, ".");
TORCH_CHECK((attn_weight.size(2) == num_heads),
"the 3rd dimensions of attn_weight should be num_heads, ",
"but now the 3rd dimension of attn_weight is ",
attn_weight.size(2), ", and num_heads is ", num_heads, ".");
TORCH_CHECK((level_start_index.size(0) == num_levels),
"the 1st dimensions of level_start_index should be num_levels, ",
"but now the 1st dimension of level_start_index is ",
level_start_index.size(0), ", and num_levels is ", num_levels,
".");
TORCH_CHECK((sampling_loc.size(3) == num_levels),
"the 4th dimensions of sampling_loc should be num_levels, ",
"but now the 4th dimension of sampling_loc is ",
sampling_loc.size(3), ", and num_levels is ", num_levels, ".");
TORCH_CHECK((attn_weight.size(3) == num_levels),
"the 4th dimensions of attn_weight should be num_levels, ",
"but now the 4th dimension of attn_weight is ",
attn_weight.size(3), ", and num_levels is ", num_levels, ".");
TORCH_CHECK((attn_weight.size(1) == num_queries),
"the 2nd dimensions of attn_weight should be num_queries, ",
"but now the 2nd dimension of attn_weight is ",
attn_weight.size(1), ", and num_queries is ", num_queries, ".");
TORCH_CHECK((attn_weight.size(4) == num_points),
"the 5th dimensions of attn_weight should be num_points, ",
"but now the 5th dimension of attn_weight is ",
attn_weight.size(4), ", and num_points is ", num_points, ".");
auto output = at::zeros({batch_size, num_queries, num_heads, channels},
value.options());
// large tensor check
const size_t max_input_size = 2147483648;
TORCH_CHECK(value.numel() < max_input_size,
"value element num should be less than 2^31, got ", value.numel(),
".");
TORCH_CHECK(sampling_loc.numel() < max_input_size,
"sampling_loc element num should be less than 2^31, got ",
sampling_loc.numel(), ".");
TORCH_CHECK(output.numel() < max_input_size,
"output element num should be less than 2^31, got ",
output.numel(), ".");
// check zero element
TORCH_CHECK(batch_size != 0, "batch_size should not be zero");
TORCH_CHECK(num_heads != 0, "num_heads should not be zero");
TORCH_CHECK(channels != 0, "channels should not be zero");
TORCH_CHECK(num_queries != 0, "num_queries should not be zero");
if (num_keys == 0 || num_levels == 0 || num_points == 0) {
return output;
}
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFuncForward(&k_dim, &k_type, batch_size, num_queries, num_heads);
// get compute queue
auto queue = torch_mlu::getCurQueue();
auto spatial_shapes_ = spatial_shapes.to(at::kInt);
auto level_start_index_ = level_start_index.to(at::kInt);
// get ptr of tensors
auto value_impl = torch_mlu::getMluTensorImpl(value);
auto value_ptr = value_impl->cnnlMalloc();
auto spatial_shapes_impl = torch_mlu::getMluTensorImpl(spatial_shapes_);
auto spatial_shapes_ptr = spatial_shapes_impl->cnnlMalloc();
auto level_start_index_impl = torch_mlu::getMluTensorImpl(level_start_index_);
auto level_start_index_ptr = level_start_index_impl->cnnlMalloc();
auto sampling_loc_impl = torch_mlu::getMluTensorImpl(sampling_loc);
auto sampling_loc_ptr = sampling_loc_impl->cnnlMalloc();
auto attn_weight_impl = torch_mlu::getMluTensorImpl(attn_weight);
auto attn_weight_ptr = attn_weight_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output);
auto output_ptr = output_impl->cnnlMalloc();
// get compute dtype of input
cnrtDataType_t data_type = torch_mlu::toCnrtDtype(value.dtype());
// launch kernel
CNLOG(INFO) << "Launch Kernel MLUKernelMsDeformAttnForward<<<" << k_dim.x
<< ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelMsDeformAttnForward(
k_dim, k_type, queue, data_type, (char*)value_ptr,
(char*)spatial_shapes_ptr, (char*)level_start_index_ptr,
(char*)sampling_loc_ptr, (char*)attn_weight_ptr, batch_size, num_keys,
num_heads, channels, num_levels, num_queries, num_points,
(char*)output_ptr);
auto spatial_shapes_int = spatial_shapes.to(at::kInt);
auto level_start_index_int = level_start_index.to(at::kInt);
INITIAL_MLU_PARAM_WITH_TENSOR(output);
INITIAL_MLU_PARAM_WITH_TENSOR(value);
INITIAL_MLU_PARAM_WITH_TENSOR(spatial_shapes_int);
INITIAL_MLU_PARAM_WITH_TENSOR(level_start_index_int);
INITIAL_MLU_PARAM_WITH_TENSOR(sampling_loc);
INITIAL_MLU_PARAM_WITH_TENSOR(attn_weight);
TORCH_MLUOP_CHECK(mluOpMsDeformAttnForward(
handle, value_desc.desc(), value_ptr, spatial_shapes_int_desc.desc(),
spatial_shapes_int_ptr, level_start_index_int_desc.desc(),
level_start_index_int_ptr, sampling_loc_desc.desc(), sampling_loc_ptr,
attn_weight_desc.desc(), attn_weight_ptr, im2col_step, output_desc.desc(),
output_ptr));
output = output.view({batch_size, num_queries, num_heads * channels});
return output;
}
void ms_deform_attn_mlu_backward(
void MsDeformAttnBackwardLauncher(
const Tensor& value, const Tensor& spatial_shapes,
const Tensor& level_start_index, const Tensor& sampling_loc,
const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,
Tensor& grad_sampling_loc, Tensor& grad_attn_weight,
const int im2col_step) {
// check contiguous
AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
AT_ASSERTM(spatial_shapes.is_contiguous(),
"spatial_shapes tensor has to be contiguous");
AT_ASSERTM(level_start_index.is_contiguous(),
"level_start_index tensor has to be contiguous");
AT_ASSERTM(sampling_loc.is_contiguous(),
"sampling_loc tensor has to be contiguous");
AT_ASSERTM(attn_weight.is_contiguous(),
"attn_weight tensor has to be contiguous");
AT_ASSERTM(grad_output.is_contiguous(),
"grad_output tensor has to be contiguous");
// check datatype
TORCH_CHECK((value.scalar_type() == at::kFloat),
"value type should be Float, got ", value.scalar_type(), ".");
TORCH_CHECK((spatial_shapes.scalar_type() == at::kInt ||
spatial_shapes.scalar_type() == at::kLong),
"spatial_shapes type should be Int, got ",
spatial_shapes.scalar_type(), ".");
TORCH_CHECK((level_start_index.scalar_type() == at::kInt ||
level_start_index.scalar_type() == at::kLong),
"level_start_index type should be Int, got ",
level_start_index.scalar_type(), ".");
TORCH_CHECK((sampling_loc.scalar_type() == at::kFloat),
"sampling_loc type should be Float, got ",
sampling_loc.scalar_type(), ".");
TORCH_CHECK((attn_weight.scalar_type() == at::kFloat),
"attn_weight type should be Float, got ",
attn_weight.scalar_type(), ".");
TORCH_CHECK((grad_output.scalar_type() == at::kFloat),
"grad_output type should be Float, got ",
grad_output.scalar_type(), ".");
auto handle = mluOpGetCurrentHandle();
auto spatial_shapes_int = spatial_shapes.to(at::kInt);
auto level_start_index_int = level_start_index.to(at::kInt);
const int batch_size = value.size(0);
const int num_keys = value.size(1);
const int num_heads = value.size(2);
const int channels = value.size(3);
const int num_levels = spatial_shapes.size(0);
const int num_queries = sampling_loc.size(1);
const int num_points = sampling_loc.size(4);
// Check shape.
TORCH_CHECK(spatial_shapes.size(1) == 2,
"the 2nd dimensions of spatial_shapes should be 2, got ",
spatial_shapes.size(1), ".");
TORCH_CHECK((level_start_index.size(0) == num_levels),
"the 1st dimensions of level_start_index should be num_levels, ",
"but now the 1st dimension of level_start_index is ",
level_start_index.size(0), ", and num_levels is ", num_levels,
".");
TORCH_CHECK((sampling_loc.size(0) == batch_size),
"the 1st dimensions of sampling_loc should be batch_size, ",
"but now the 1st dimension of sampling_loc is ",
sampling_loc.size(0), ", and batch_size is ", batch_size, ".");
TORCH_CHECK((sampling_loc.size(2) == num_heads),
"the 3rd dimensions of sampling_loc should be num_heads, ",
"but now the 3rd dimension of sampling_loc is ",
sampling_loc.size(2), ", and num_heads is ", num_heads, ".");
TORCH_CHECK((sampling_loc.size(3) == num_levels),
"the 4th dimensions of sampling_loc should be num_levels, ",
"but now the 4th dimension of sampling_loc is ",
sampling_loc.size(3), ", and num_levels is ", num_levels, ".");
TORCH_CHECK(sampling_loc.size(5) == 2,
"the 6th dimensions of sampling_loc should be 2, got ",
sampling_loc.size(5), ".");
TORCH_CHECK((attn_weight.size(0) == batch_size),
"the 1st dimensions of attn_weight should be batch_size, ",
"but now the 1st dimension of attn_weight is ",
attn_weight.size(0), ", and batch_size is ", batch_size, ".");
TORCH_CHECK((attn_weight.size(1) == num_queries),
"the 2nd dimensions of attn_weight should be num_queries, ",
"but now the 2nd dimension of attn_weight is ",
attn_weight.size(1), ", and num_queries is ", num_queries, ".");
TORCH_CHECK((attn_weight.size(2) == num_heads),
"the 3rd dimensions of attn_weight should be num_heads, ",
"but now the 3rd dimension of attn_weight is ",
attn_weight.size(2), ", and num_heads is ", num_heads, ".");
TORCH_CHECK((attn_weight.size(3) == num_levels),
"the 4th dimensions of attn_weight should be num_levels, ",
"but now the 4th dimension of attn_weight is ",
attn_weight.size(3), ", and num_levels is ", num_levels, ".");
TORCH_CHECK((attn_weight.size(4) == num_points),
"the 5th dimensions of attn_weight should be num_points, ",
"but now the 5th dimension of attn_weight is ",
attn_weight.size(4), ", and num_points is ", num_points, ".");
TORCH_CHECK((grad_output.size(0) == batch_size),
"the 1st dimensions of grad_output should be batch_size, ",
"but now the 1st dimension of grad_output is ",
grad_output.size(0), ", and batch_size is ", batch_size, ".");
TORCH_CHECK((grad_output.size(1) == num_queries),
"the 2nd dimensions of grad_output should be num_queries, ",
"but now the 2nd dimension of grad_output is ",
grad_output.size(1), ", and num_queries is ", num_queries, ".");
TORCH_CHECK(
(grad_output.size(2) == num_heads * channels),
"the 3rd dimensions of grad_output should be num_heads * channels, ",
"but now the 3rd dimension of grad_output is ", grad_output.size(2),
", and num_heads * channels is ", num_heads * channels, ".");
// check zero element
TORCH_CHECK(batch_size != 0, "The batch_size is zero.");
TORCH_CHECK(channels != 0, "The channels is zero.");
TORCH_CHECK(num_keys != 0, "The num_keys is zero.");
TORCH_CHECK(num_heads != 0, "The num_heads is zero.");
TORCH_CHECK(num_queries != 0, "The num_queries is zero.");
if (num_levels == 0 || num_points == 0) {
return;
}
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFuncBackward(batch_size, num_queries, num_heads, num_levels, &k_type,
&k_dim);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto value_impl = torch_mlu::getMluTensorImpl(value);
auto value_ptr = value_impl->cnnlMalloc();
auto spatial_shapes_impl = torch_mlu::getMluTensorImpl(spatial_shapes);
auto spatial_shapes_ptr = spatial_shapes_impl->cnnlMalloc();
auto level_start_index_impl = torch_mlu::getMluTensorImpl(level_start_index);
auto level_start_index_ptr = level_start_index_impl->cnnlMalloc();
auto sampling_loc_impl = torch_mlu::getMluTensorImpl(sampling_loc);
auto sampling_loc_ptr = sampling_loc_impl->cnnlMalloc();
auto attn_weight_impl = torch_mlu::getMluTensorImpl(attn_weight);
auto attn_weight_ptr = attn_weight_impl->cnnlMalloc();
auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output);
auto grad_output_ptr = grad_output_impl->cnnlMalloc();
auto grad_value_impl = torch_mlu::getMluTensorImpl(grad_value);
auto grad_value_ptr = grad_value_impl->cnnlMalloc();
auto grad_sampling_loc_impl = torch_mlu::getMluTensorImpl(grad_sampling_loc);
auto grad_sampling_loc_ptr = grad_sampling_loc_impl->cnnlMalloc();
auto grad_attn_weight_impl = torch_mlu::getMluTensorImpl(grad_attn_weight);
auto grad_attn_weight_ptr = grad_attn_weight_impl->cnnlMalloc();
// get compute dtype of input
cnrtDataType_t data_type = torch_mlu::toCnrtDtype(value.dtype());
auto grad_output_dim4 =
grad_output.view({batch_size, num_queries, num_heads, channels});
// auto grad_output_dim4 = grad_output.view({batch_size, num_queries,
// num_heads, channels}).detach();
INITIAL_MLU_PARAM_WITH_TENSOR(value);
INITIAL_MLU_PARAM_WITH_TENSOR(spatial_shapes_int);
INITIAL_MLU_PARAM_WITH_TENSOR(level_start_index_int);
INITIAL_MLU_PARAM_WITH_TENSOR(sampling_loc);
INITIAL_MLU_PARAM_WITH_TENSOR(attn_weight);
INITIAL_MLU_PARAM_WITH_TENSOR(grad_output_dim4);
// INITIAL_MLU_PARAM_WITH_TENSOR(grad_output);
INITIAL_MLU_PARAM_WITH_TENSOR(grad_value);
INITIAL_MLU_PARAM_WITH_TENSOR(grad_sampling_loc);
INITIAL_MLU_PARAM_WITH_TENSOR(grad_attn_weight);
mluOpMsDeformAttnBackward(
handle, value_desc.desc(), value_ptr, spatial_shapes_int_desc.desc(),
spatial_shapes_int_ptr, level_start_index_int_desc.desc(),
level_start_index_int_ptr, sampling_loc_desc.desc(), sampling_loc_ptr,
attn_weight_desc.desc(), attn_weight_ptr, grad_output_dim4_desc.desc(),
grad_output_dim4_ptr, im2col_step, grad_value_desc.desc(), grad_value_ptr,
grad_sampling_loc_desc.desc(), grad_sampling_loc_ptr,
grad_attn_weight_desc.desc(), grad_attn_weight_ptr);
return;
}
// launch kernel
CNLOG(INFO) << "Launch Kernel MLUKernelMsDeformAttnBackward<<<" << k_dim.x
<< ", " << k_dim.y << ", " << k_dim.z << ">>>";
Tensor ms_deform_attn_mlu_forward(const Tensor& value,
const Tensor& spatial_shapes,
const Tensor& level_start_index,
const Tensor& sampling_loc,
const Tensor& attn_weight,
const int im2col_step) {
return MsDeformAttnForwardLauncher(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, im2col_step);
}
KernelMsDeformAttnBackward(
k_dim, k_type, queue, data_type, (float*)value_ptr,
(int32_t*)spatial_shapes_ptr, (int32_t*)level_start_index_ptr,
(float*)sampling_loc_ptr, (float*)attn_weight_ptr,
(float*)grad_output_ptr, batch_size, num_keys, num_heads, channels,
num_levels, num_queries, num_points, (float*)grad_value_ptr,
(float*)grad_sampling_loc_ptr, (float*)grad_attn_weight_ptr);
void ms_deform_attn_mlu_backward(
const Tensor& value, const Tensor& spatial_shapes,
const Tensor& level_start_index, const Tensor& sampling_loc,
const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,
Tensor& grad_sampling_loc, Tensor& grad_attn_weight,
const int im2col_step) {
return MsDeformAttnBackwardLauncher(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, grad_output,
grad_value, grad_sampling_loc,
grad_attn_weight, im2col_step);
}
Tensor ms_deform_attn_impl_forward(const Tensor& value,
......@@ -416,5 +124,6 @@ void ms_deform_attn_impl_backward(
REGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, MLU,
ms_deform_attn_mlu_forward);
REGISTER_DEVICE_IMPL(ms_deform_attn_impl_backward, MLU,
ms_deform_attn_mlu_backward);
......@@ -10,123 +10,35 @@
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_ptr,
const void *scores_ptr, const int input_num_boxes,
const int max_output_boxes, const float iou_threshold,
const float offset, void *workspace_ptr, void *output_size_ptr,
void *output_ptr);
int selectUnionType(uint32_t use_job, int box_num_per_core) {
// the box_num_per_core should be at least 256, otherwise the real IO
// bandwidth would be very low
while (box_num_per_core < 256 && use_job >= 4) {
box_num_per_core *= 2;
use_job /= 2;
}
return use_job;
}
static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
int &core_num_per_class,
const int input_box_num) {
uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t cluster_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
uint32_t job_limit = getJobLimitCapability();
uint32_t core_number = job_limit;
int box_num_per_core = (input_box_num + core_number - 1) / core_number;
int use_job = selectUnionType(job_limit, box_num_per_core);
// initialize k_type as Union1
k_dim->x = core_dim;
k_dim->y = 1;
k_dim->z = 1;
*k_type = CNRT_FUNC_TYPE_UNION1;
switch (job_limit) {
case CN_KERNEL_CLASS_BLOCK:
case CN_KERNEL_CLASS_UNION:
case CN_KERNEL_CLASS_UNION2:
case CN_KERNEL_CLASS_UNION4:
case CN_KERNEL_CLASS_UNION8:
case CN_KERNEL_CLASS_UNION16: {
if (use_job < 4) {
k_dim->x = 1;
*k_type = CNRT_FUNC_TYPE_BLOCK;
} else if (use_job == 4) {
k_dim->x = core_dim;
*k_type = CNRT_FUNC_TYPE_UNION1;
} else {
k_dim->x = use_job;
*k_type = (cnrtFunctionType_t)use_job;
}
}; break;
default:
LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number."
<< " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task.";
}
return CNNL_STATUS_SUCCESS;
}
#include "mlu_common_helper.h"
Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset) {
// dimension parameters check
TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
boxes.dim(), "D");
TORCH_CHECK(boxes.size(1) == 4,
"boxes should have 4 elements in dimension 1, got ",
boxes.size(1));
TORCH_CHECK(scores.dim() == 1, "scores should be a 1d tensor, got ",
scores.dim(), "D");
// data type check
TORCH_CHECK(boxes.scalar_type() == scores.scalar_type(),
"boxes should have the same type as scores");
TORCH_CHECK(
boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
"data type of boxes should be Float or Half, got ", boxes.scalar_type());
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
int input_num_boxes = boxes.size(0);
int max_output_boxes = boxes.size(0);
cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
cnrtDim3_t k_dim;
cnrtJobType_t k_type;
int core_num_per_class;
policyFunc(&k_dim, &k_type, core_num_per_class, input_num_boxes);
// transpose boxes (n, 4) to (4, n) for better performance
auto boxes_t = boxes.transpose(0, 1);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores);
auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kLong));
auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kInt));
auto output_size = at::empty({1}, scores.options().dtype(at::kInt));
MluOpTensorDescriptor boxes_desc, scores_desc, output_desc;
boxes_desc.set(boxes_);
scores_desc.set(scores_);
output_desc.set(output);
// workspace
const int info_num = 5; // x1, x2, y1, y2 and score
size_t space_size = 0;
if (boxes.scalar_type() == at::kHalf) {
space_size = input_num_boxes * sizeof(int16_t) * info_num + sizeof(float);
} else {
space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float);
}
#if __BANG_ARCH__ > 370
int cluster_num = getCoreNumOfJobLimitCapability() /
torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
space_size += cluster_num * sizeof(float) * 7;
#endif
auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
size_t workspace_size = 0;
auto handle = mluOpGetCurrentHandle();
TORCH_MLUOP_CHECK(mluOpGetNmsWorkspaceSize(
handle, boxes_desc.desc(), scores_desc.desc(), &workspace_size));
auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));
// get compute queue
auto queue = torch_mlu::getCurQueue();
auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
auto boxes_ptr = boxes_impl->cnnlMalloc();
auto scores_impl = torch_mlu::getMluTensorImpl(scores_);
......@@ -138,14 +50,32 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);
auto output_size_ptr = output_size_impl->cnnlMalloc();
uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
CNLOG(INFO) << "Launch Kernel MLUUnionX NMS<<<Union" << k_type / core_dim
<< ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr,
input_num_boxes, max_output_boxes, iou_threshold, offset,
workspace_ptr, output_size_ptr, output_ptr);
// nms desc
mluOpNmsDescriptor_t nms_desc;
const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;
const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;
const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;
const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;
const float soft_nms_sigma = 0.0;
const float confidence_threshold = 0.0;
const int input_layout = 0;
const bool pad_to_max_output_size = false;
const int max_output_size = max_output_boxes;
TORCH_MLUOP_CHECK(mluOpCreateNmsDescriptor(&nms_desc));
TORCH_MLUOP_CHECK(mluOpSetNmsDescriptor(
nms_desc, box_mode, output_mode, algo, method_mode, iou_threshold,
soft_nms_sigma, max_output_size, confidence_threshold, (float)offset,
input_layout, pad_to_max_output_size));
TORCH_MLUOP_CHECK(mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr,
scores_desc.desc(), scores_ptr, workspace_ptr,
workspace_size, output_desc.desc(), output_ptr,
output_size_ptr));
TORCH_MLUOP_CHECK(mluOpDestroyNmsDescriptor(nms_desc));
int output_num = *static_cast<int *>(output_size.cpu().data_ptr());
return output.slice(0, 0, output_num);
auto ret = output.to(boxes.options().dtype(at::kLong));
return ret.slice(0, 0, output_num);
}
Tensor nms_mlu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
......
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "mlu_common_helper.h"
Tensor nms_rotated_mlu(Tensor boxes, Tensor scores, float iou_threshold) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
int boxes_num = boxes.size(0);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores);
auto output = at::empty({boxes_num}, boxes.options().dtype(at::kInt));
auto output_size = at::empty({1}, scores.options().dtype(at::kInt));
MluOpTensorDescriptor boxes_desc, scores_desc, output_desc;
boxes_desc.set(boxes_);
scores_desc.set(scores_);
output_desc.set(output);
// workspace
size_t workspace_size = 0;
auto handle = mluOpGetCurrentHandle();
TORCH_MLUOP_CHECK(mluOpGetNmsRotatedWorkspaceSize(handle, boxes_desc.desc(),
&workspace_size));
auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));
auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
auto boxes_ptr = boxes_impl->cnnlMalloc();
auto scores_impl = torch_mlu::getMluTensorImpl(scores_);
auto scores_ptr = scores_impl->cnnlMalloc();
auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
auto workspace_ptr = workspace_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output);
auto output_ptr = output_impl->cnnlMalloc();
auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);
auto output_size_ptr = output_size_impl->cnnlMalloc();
TORCH_MLUOP_CHECK(mluOpNmsRotated(
handle, iou_threshold, boxes_desc.desc(), boxes_ptr, scores_desc.desc(),
scores_ptr, workspace_ptr, workspace_size, output_desc.desc(), output_ptr,
(int *)output_size_ptr));
int output_num = *static_cast<int *>(output_size.cpu().data_ptr());
auto ret = output.to(boxes.options().dtype(at::kLong));
return ret.slice(0, 0, output_num);
}
......@@ -9,136 +9,7 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <algorithm>
#include "psamask_utils.hpp"
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
#define COMPUTE_COUNT_ALIGN 64
void KernelPsamaskForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *x, void *y, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int x_c, const int y_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg);
void KernelPsamaskBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *dy, void *dx, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg);
namespace {
void policyFunc(cnrtDim3_t *k_dim_ptr, cnrtFunctionType_t *f_type_ptr,
PartitionSeg *partition_ptr, const int n, const int h_feature) {
unsigned int core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
unsigned int cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
unsigned int use_cluster_num = cluster_num;
unsigned int use_core_num = core_dim;
if (n >= cluster_num || n >= h_feature) {
partition_ptr->cluster_partition = PARTITION_N;
partition_ptr->n_per_cluster = (n + cluster_num - 1) / cluster_num;
partition_ptr->h_per_cluster = h_feature;
use_cluster_num =
(n + partition_ptr->n_per_cluster - 1) / partition_ptr->n_per_cluster;
} else {
partition_ptr->cluster_partition = PARTITION_H;
partition_ptr->h_per_cluster = (h_feature + cluster_num - 1) / cluster_num;
partition_ptr->n_per_cluster = n;
use_cluster_num = (h_feature + partition_ptr->h_per_cluster - 1) /
partition_ptr->h_per_cluster;
}
if (partition_ptr->n_per_cluster >= core_dim ||
partition_ptr->n_per_cluster >= partition_ptr->h_per_cluster) {
partition_ptr->core_partition = PARTITION_N;
partition_ptr->n_per_core =
(partition_ptr->n_per_cluster + core_dim - 1) / core_dim;
partition_ptr->h_per_core = partition_ptr->h_per_cluster;
use_core_num =
(partition_ptr->n_per_cluster + partition_ptr->n_per_core - 1) /
partition_ptr->n_per_core;
} else {
partition_ptr->core_partition = PARTITION_H;
partition_ptr->h_per_core =
(partition_ptr->h_per_cluster + core_dim - 1) / core_dim;
partition_ptr->n_per_core = partition_ptr->n_per_cluster;
use_core_num =
(partition_ptr->h_per_cluster + partition_ptr->h_per_core - 1) /
partition_ptr->h_per_core;
}
*k_dim_ptr = {core_dim, use_cluster_num, 1};
}
} // namespace
bool findLimit(const int shape_core_n, const int shape_core_h,
const int shape_core_w, const int shape_core_ci,
const int shape_core_co, int *limit_n_seg_ptr,
int *limit_h_seg_ptr, int *limit_w_seg_ptr, const int psa_type) {
const bool need_temp = psa_type == 1;
const int input_bytes = sizeof(float);
int limit_n_seg = shape_core_n;
int limit_h_seg = shape_core_h;
int limit_w_seg = shape_core_w;
const int max_nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
const int align_base_128 = NFU_ALIGN_SIZE / input_bytes;
const int align_base_64 = COMPUTE_COUNT_ALIGN / input_bytes;
const int align_co = CEIL_ALIGN(shape_core_co, align_base_64);
const int align_w = CEIL_ALIGN(shape_core_w, align_base_64);
const int align_hw = CEIL_ALIGN(shape_core_h * shape_core_w, align_base_64);
const int max_num = max_nram_size / input_bytes;
int n_limit =
max_num /
(CEIL_ALIGN(shape_core_h * shape_core_w * shape_core_ci, align_base_128) +
align_hw * align_co * (1 + need_temp));
if (n_limit > 0) {
n_limit = std::min(n_limit, shape_core_n);
limit_n_seg = n_limit;
} else {
int h_limit =
max_num / (CEIL_ALIGN(shape_core_w * shape_core_ci, align_base_128) +
align_w * align_co * (1 + need_temp));
if (h_limit > 0) {
h_limit = std::min(h_limit, shape_core_h);
limit_h_seg = h_limit;
limit_n_seg = 1;
} else {
int w_limit =
max_num / (CEIL_ALIGN(shape_core_ci, align_base_128) +
CEIL_ALIGN(align_co, align_base_128) * (1 + need_temp));
if (w_limit > 0 && w_limit >= (COMPUTE_COUNT_ALIGN / input_bytes)) {
w_limit = std::min(w_limit, shape_core_w);
w_limit = w_limit / (COMPUTE_COUNT_ALIGN / input_bytes) *
(COMPUTE_COUNT_ALIGN / input_bytes);
limit_w_seg = w_limit;
limit_h_seg = 1;
limit_n_seg = 1;
} else {
CNLOG(INFO) << "The size of input channel is too large.";
return false;
}
}
}
*limit_n_seg_ptr = limit_n_seg;
*limit_h_seg_ptr = limit_h_seg;
*limit_w_seg_ptr = limit_w_seg;
return true;
}
#include "mlu_common_helper.h"
void PSAMaskForwardMLUKernelLauncher(const int psa_type, const Tensor x,
Tensor y, const int num_,
......@@ -146,39 +17,7 @@ void PSAMaskForwardMLUKernelLauncher(const int psa_type, const Tensor x,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask) {
// params check
TORCH_CHECK(x.scalar_type() == at::kFloat, "x type should be Float, got ",
x.scalar_type());
TORCH_CHECK(y.scalar_type() == x.scalar_type(),
"y should have the same type as x");
TORCH_CHECK(x.dim() == 4, "x should be a 4d tensor, got ", x.dim(), "D");
TORCH_CHECK(y.dim() == 4, "y should be a 4d tensor, got ", y.dim(), "D");
int x_c = x.size(1);
int y_c = y.size(1);
TORCH_CHECK(h_mask * w_mask == x_c,
"channel of x should be the same as h_mask * w_mask");
TORCH_CHECK(h_feature * w_feature == y_c,
"channel of y should be the same as h_feature * w_feature");
TORCH_CHECK(psa_type == 0 || psa_type == 1,
"psa_type only supports 'COLLECT' and 'DISTRIBUTE' currently");
if (x.numel() == 0) {
CNLOG(INFO) << "skip zero-element tensor";
return;
}
cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1;
cnrtDim3_t k_dim;
PartitionSeg partition_info;
policyFunc(&k_dim, &k_type, &partition_info, num_, h_feature);
int n_limit_seg, h_limit_seg, w_limit_seg;
bool ret =
findLimit(partition_info.n_per_core, partition_info.h_per_core, w_feature,
x_c, y_c, &n_limit_seg, &h_limit_seg, &w_limit_seg, psa_type);
if (ret != true) {
return;
}
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(x.dim());
......@@ -186,22 +25,18 @@ void PSAMaskForwardMLUKernelLauncher(const int psa_type, const Tensor x,
at::Tensor y_tmp =
at::empty({num_, y_c, h_feature, w_feature}, x.options(), memory_format);
// get compute queue
auto queue = torch_mlu::getCurQueue();
MluOpTensorDescriptor x_desc, y_desc;
x_desc.set_with_layout(x_tensor, MLUOP_LAYOUT_NHWC);
y_desc.set_with_layout(y_tmp, MLUOP_LAYOUT_NHWC);
// get ptr of tensors
auto handle = mluOpGetCurrentHandle();
auto x_impl = torch_mlu::getMluTensorImpl(x_tensor);
auto x_ptr = x_impl->cnnlMalloc();
auto y_impl = torch_mlu::getMluTensorImpl(y_tmp);
auto y_ptr = y_impl->cnnlMalloc();
KernelPsamaskForward(
k_dim, k_type, queue, x_ptr, y_ptr, (PsamaskType)psa_type,
partition_info.core_partition, partition_info.cluster_partition, num_,
h_feature, w_feature, h_mask, w_mask, x_c, y_c, half_h_mask, half_w_mask,
partition_info.n_per_core, partition_info.h_per_core,
partition_info.n_per_cluster, partition_info.h_per_cluster, n_limit_seg,
h_limit_seg, w_limit_seg);
TORCH_MLUOP_CHECK(mluOpPsamaskForward(handle, psa_type, x_desc.desc(), x_ptr,
h_mask, w_mask, y_desc.desc(), y_ptr));
y.copy_(y_tmp);
}
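// Shape relations enforced by the checks above, restated as a sketch (helper
// names are hypothetical, illustration only):
//   x: [num_, h_mask * w_mask, h_feature, w_feature]
//   y: [num_, h_feature * w_feature, h_feature, w_feature]
namespace {
inline int psamask_x_channels_sketch(int h_mask, int w_mask) {
  return h_mask * w_mask;
}
inline int psamask_y_channels_sketch(int h_feature, int w_feature) {
  return h_feature * w_feature;
}
}  // namespace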
......@@ -212,39 +47,7 @@ void PSAMaskBackwardMLUKernelLauncher(const int psa_type, const Tensor dy,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask) {
// params check
TORCH_CHECK(dy.scalar_type() == at::kFloat, "dy type should be Float, got ",
dy.scalar_type());
TORCH_CHECK(dx.scalar_type() == dy.scalar_type(),
"dx should have the same type as dy");
TORCH_CHECK(dy.dim() == 4, "dy should be a 4d tensor, got ", dy.dim(), "D");
TORCH_CHECK(dx.dim() == 4, "dx should be a 4d tensor, got ", dx.dim(), "D");
int dy_c = dy.size(1);
int dx_c = dx.size(1);
TORCH_CHECK(h_feature * w_feature == dy_c,
"channel of dy should be the same as h_feature * w_feature");
TORCH_CHECK(h_mask * w_mask == dx_c,
"channel of dx should be the same as h_mask * w_mask");
TORCH_CHECK(psa_type == 0 || psa_type == 1,
"psa_type only supports 'COLLECT' and 'DISTRIBUTE' currently");
if (dx.numel() == 0) {
CNLOG(INFO) << "skip zero-element tensor";
return;
}
cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1;
cnrtDim3_t k_dim;
PartitionSeg partition_info;
policyFunc(&k_dim, &k_type, &partition_info, num_, h_feature);
int n_limit_seg, h_limit_seg, w_limit_seg;
bool ret =
findLimit(partition_info.n_per_core, partition_info.h_per_core, w_feature,
dx_c, dy_c, &n_limit_seg, &h_limit_seg, &w_limit_seg, psa_type);
if (ret != true) {
return;
}
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(dy.dim());
......@@ -252,8 +55,11 @@ void PSAMaskBackwardMLUKernelLauncher(const int psa_type, const Tensor dy,
at::Tensor dx_tmp = at::empty({num_, dx_c, h_feature, w_feature},
dy.options(), memory_format);
// get compute queue
auto queue = torch_mlu::getCurQueue();
MluOpTensorDescriptor dy_desc, dx_tmp_desc;
dy_desc.set_with_layout(dy_tensor, MLUOP_LAYOUT_NHWC);
dx_tmp_desc.set_with_layout(dx_tmp, MLUOP_LAYOUT_NHWC);
auto handle = mluOpGetCurrentHandle();
// get ptr of tensors
auto dx_impl = torch_mlu::getMluTensorImpl(dx_tmp);
......@@ -261,13 +67,9 @@ void PSAMaskBackwardMLUKernelLauncher(const int psa_type, const Tensor dy,
auto dy_impl = torch_mlu::getMluTensorImpl(dy_tensor);
auto dy_ptr = dy_impl->cnnlMalloc();
KernelPsamaskBackward(
k_dim, k_type, queue, dy_ptr, dx_ptr, (PsamaskType)psa_type,
partition_info.core_partition, partition_info.cluster_partition, num_,
h_feature, w_feature, h_mask, w_mask, dx_c, dy_c, half_h_mask,
half_w_mask, partition_info.n_per_core, partition_info.h_per_core,
partition_info.n_per_cluster, partition_info.h_per_cluster, n_limit_seg,
h_limit_seg, w_limit_seg);
TORCH_MLUOP_CHECK(mluOpPsamaskBackward(handle, psa_type, dy_desc.desc(),
dy_ptr, h_mask, w_mask,
dx_tmp_desc.desc(), dx_ptr));
dx.copy_(dx_tmp);
}
......
......@@ -9,26 +9,7 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *input, const void *rois, const int channels,
const bool aligned, const int pooled_height,
const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio,
const float spatial_scale, const int num_rois,
void *output);
void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t dtype,
const void *grads, const void *boxes,
void *grads_image, const int boxes_num,
const int hi, const int wi, const int c,
const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio,
const bool aligned);
#include "mlu_common_helper.h"
void ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
......@@ -36,17 +17,7 @@ void ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
// params check
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"input type should be Float or Half, got ", input.scalar_type());
TORCH_CHECK(rois.scalar_type() == input.scalar_type(),
"rois should have the same type as input");
TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ",
input.dim(), "D");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D");
TORCH_CHECK(pool_mode == 1, "pool_mode only supports 'avg' currently");
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
auto input_tensor =
......@@ -57,52 +28,56 @@ void ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
int height = input.size(2);
int width = input.size(3);
if (output.numel() == 0) {
output = at::zeros({num_rois, channels, aligned_height, aligned_width},
input.options());
return;
}
at::Tensor output_tmp =
auto output_contiguous =
at::empty({num_rois, channels, aligned_height, aligned_width},
input.options(), memory_format);
// get tensor impl
auto self_impl = torch_mlu::getMluTensorImpl(input_tensor);
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto output_impl = torch_mlu::getMluTensorImpl(output_tmp);
auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
// get compute queue
auto queue = torch_mlu::getCurQueue();
MluOpTensorDescriptor input_desc, rois_desc, argmax_y_desc, argmax_x_desc,
output_desc;
input_desc.set_with_layout(input_tensor, MLUOP_LAYOUT_NHWC);
rois_desc.set_with_layout(rois, MLUOP_LAYOUT_ARRAY);
output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);
// get the mlu ptr
auto self_ptr = self_impl->cnnlMalloc();
auto rois_ptr = rois_impl->cnnlMalloc();
auto output_ptr = output_impl->cnnlMalloc();
cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
cnrtDim3_t k_dim;
k_dim.x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
k_dim.y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
k_dim.z = 1;
cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input.dtype());
KernelRoiAlign(k_dim, k_type, queue, data_type, self_ptr, rois_ptr, channels,
aligned, aligned_height, aligned_width, height, width,
sampling_ratio, spatial_scale, num_rois, output_ptr);
output.copy_(output_tmp);
}
static int nearestPower2(int x) {
x--;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x++;
return x;
mluOpRoiAlignForwardDescriptor_t roialign_desc;
TORCH_MLUOP_CHECK(mluOpCreateRoiAlignForwardDescriptor(&roialign_desc));
TORCH_MLUOP_CHECK(mluOpSetRoiAlignForwardDescriptor_v2(
roialign_desc, aligned_height, aligned_width, sampling_ratio,
spatial_scale, pool_mode, aligned));
auto handle = mluOpGetCurrentHandle();
if (pool_mode == 0) {
auto argmax_y_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(argmax_y, memory_format);
auto argmax_x_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(argmax_x, memory_format);
auto argmax_x_impl = torch_mlu::getMluTensorImpl(argmax_x_contiguous);
auto argmax_y_impl = torch_mlu::getMluTensorImpl(argmax_y_contiguous);
auto argmax_x_ptr = argmax_x_impl->cnnlMalloc();
auto argmax_y_ptr = argmax_y_impl->cnnlMalloc();
    argmax_y_desc.set_with_layout(argmax_y_contiguous, MLUOP_LAYOUT_NHWC);
    argmax_x_desc.set_with_layout(argmax_x_contiguous, MLUOP_LAYOUT_NHWC);
TORCH_MLUOP_CHECK(mluOpRoiAlignForward_v2(
handle, roialign_desc, input_desc.desc(), self_ptr, rois_desc.desc(),
rois_ptr, output_desc.desc(), output_ptr, argmax_x_desc.desc(),
argmax_x_ptr, argmax_y_desc.desc(), argmax_y_ptr));
argmax_x.copy_(argmax_x_contiguous);
argmax_y.copy_(argmax_y_contiguous);
} else {
TORCH_MLUOP_CHECK(mluOpRoiAlignForward_v2(
handle, roialign_desc, input_desc.desc(), self_ptr, rois_desc.desc(),
rois_ptr, output_desc.desc(), output_ptr, NULL, NULL, NULL, NULL));
}
TORCH_MLUOP_CHECK(mluOpDestroyRoiAlignForwardDescriptor(roialign_desc));
output.copy_(output_contiguous);
}
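// Optional pattern, shown only as a sketch: the forward descriptor created
// above could be held by a small RAII guard so it is released even if a
// TORCH_MLUOP_CHECK in between throws. Only the mluOp calls already used in
// this file are assumed; the struct name is hypothetical.
namespace {
struct RoiAlignFwdDescGuardSketch {
  mluOpRoiAlignForwardDescriptor_t desc{nullptr};
  RoiAlignFwdDescGuardSketch() {
    TORCH_MLUOP_CHECK(mluOpCreateRoiAlignForwardDescriptor(&desc));
  }
  ~RoiAlignFwdDescGuardSketch() {
    // Destructors must not throw, so the status is intentionally ignored.
    if (desc) mluOpDestroyRoiAlignForwardDescriptor(desc);
  }
};
}  // namespace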
void ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois,
......@@ -112,17 +87,7 @@ void ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois,
int sampling_ratio, int pool_mode,
bool aligned) {
// params check
TORCH_CHECK(
grad.scalar_type() == at::kFloat || grad.scalar_type() == at::kHalf,
"grad type should be Float or Half, got ", grad.scalar_type());
TORCH_CHECK(rois.scalar_type() == grad.scalar_type(),
"rois should have the same type as grad");
TORCH_CHECK(grad.dim() == 4, "grad should be a 4d tensor, got ", grad.dim(),
"D");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D");
TORCH_CHECK(pool_mode == 1, "pool_mode only supports 'avg' currently");
int batch_size = grad_input.size(0);
int channels = grad_input.size(1);
int height = grad_input.size(2);
......@@ -148,26 +113,40 @@ void ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois,
auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get the mlu ptr
auto grad_ptr = grad_impl->cnnlMalloc();
auto rois_ptr = rois_impl->cnnlMalloc();
auto grad_input_ptr = grad_input_impl->cnnlMalloc();
cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
int need_core = nearestPower2(boxes_num);
int union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
uint32_t dim_x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t dim_y = (need_core - 1) / dim_x + 1;
dim_y = (dim_y > union_number) ? union_number : dim_y;
cnrtDim3_t k_dim = {dim_x, dim_y, 1};
cnrtDataType_t k_dtype = torch_mlu::toCnrtDtype(grad.dtype());
KernelRoiAlignBackward(k_dim, k_type, queue, k_dtype, grad_ptr, rois_ptr,
grad_input_ptr, boxes_num, hi, wi, c, no, ho, wo,
spatial_scale, sampling_ratio, aligned);
MluOpTensorDescriptor grads_desc, rois_desc, argmax_y_desc, argmax_x_desc,
grad_input_desc;
grads_desc.set_with_layout(grad_, MLUOP_LAYOUT_NHWC);
rois_desc.set_with_layout(rois, MLUOP_LAYOUT_ARRAY);
grad_input_desc.set_with_layout(grad_input_, MLUOP_LAYOUT_NHWC);
auto handle = mluOpGetCurrentHandle();
if (pool_mode == 0) {
auto argmax_y_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(argmax_y, memory_format);
auto argmax_x_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(argmax_x, memory_format);
auto argmax_x_impl = torch_mlu::getMluTensorImpl(argmax_x_contiguous);
auto argmax_y_impl = torch_mlu::getMluTensorImpl(argmax_y_contiguous);
auto argmax_x_ptr = argmax_x_impl->cnnlMalloc();
auto argmax_y_ptr = argmax_y_impl->cnnlMalloc();
    argmax_y_desc.set_with_layout(argmax_y_contiguous, MLUOP_LAYOUT_NHWC);
    argmax_x_desc.set_with_layout(argmax_x_contiguous, MLUOP_LAYOUT_NHWC);
TORCH_MLUOP_CHECK(mluOpRoiAlignBackward_v2(
handle, grads_desc.desc(), grad_ptr, rois_desc.desc(), rois_ptr,
        argmax_x_desc.desc(), argmax_x_ptr, argmax_y_desc.desc(), argmax_y_ptr,
spatial_scale, sampling_ratio, aligned, pool_mode,
grad_input_desc.desc(), grad_input_ptr));
} else {
TORCH_MLUOP_CHECK(mluOpRoiAlignBackward_v2(
handle, grads_desc.desc(), grad_ptr, rois_desc.desc(), rois_ptr, NULL,
NULL, NULL, NULL, spatial_scale, sampling_ratio, aligned, pool_mode,
grad_input_desc.desc(), grad_input_ptr));
}
grad_input.copy_(grad_input_);
}
......
......@@ -9,37 +9,7 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
#include "roi_align_rotated_utils.hpp"
namespace {
void policyFunc(int bin_num, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
unsigned int core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
unsigned int cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = core_num;
unsigned int use_cluster = (bin_num + core_num - 1) / core_num;
k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster;
k_dim->z = 1;
}
} // namespace
void KernelRoiAlignRotatedForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *features, const void *rois,
void *output, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams);
void KernelRoiAlignRotatedBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *top_grad, const void *rois,
void *bottom_grad, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams);
#include "mlu_common_helper.h"
void ROIAlignRotatedForwardMLUKernelLauncher(Tensor input, Tensor rois,
Tensor output, int pooled_height,
......@@ -47,153 +17,70 @@ void ROIAlignRotatedForwardMLUKernelLauncher(Tensor input, Tensor rois,
float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
TORCH_CHECK(((input.scalar_type() == output.scalar_type()) &&
(output.scalar_type() == rois.scalar_type())),
"data types of input, rois and output should be the same, ",
"but now input type is ", input.scalar_type(), ", rois type is ",
rois.scalar_type(), ", output type is ", output.scalar_type(),
".");
TORCH_CHECK(
(input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf),
"input type should be Float or Half, got ", input.scalar_type(), ".");
TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ",
input.dim(), "D.");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D.");
TORCH_CHECK(output.dim() == 4, "output should be a 4d tensor, got ",
output.dim(), "D.");
TORCH_CHECK((rois.size(0) == output.size(0)),
"the 1st dimensions of rois and output should be the same, ",
"but now the 1st dimension of rois is ", rois.size(0),
", and output is ", output.size(0), ".");
TORCH_CHECK((input.size(1) == output.size(1)),
"the 2nd dimensions of input and output should be the same, ",
"but now the 2nd dimension of input is ", input.size(1),
", and output is ", output.size(1), ".");
int channel = input.size(1);
int width = input.size(3);
int height = input.size(2);
int batch = input.size(0);
int rois_nums = rois.size(0);
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());
// return if zero-elements
if (input.numel() == 0) {
CNLOG(INFO) << "Skip the zero-elements case.";
return;
}
RoiAlignRotatedParams roiAlignRotatedParams{pooled_height, pooled_width,
sampling_ratio, spatial_scale,
aligned, clockwise};
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(rois_nums * pooled_height * pooled_width, &k_dim, &k_type);
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
auto input_tensor =
torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
at::Tensor output_tmp =
at::empty({rois_nums, channel, pooled_height, pooled_width},
input.options(), memory_format);
auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
auto rois_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
auto output_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format);
// get compute queue
auto queue = torch_mlu::getCurQueue();
MluOpTensorDescriptor input_desc, rois_desc, output_desc;
input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);
rois_desc.set(rois_contiguous);
output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input_tensor);
auto input_impl = torch_mlu::getMluTensorImpl(input_);
auto input_ptr = input_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
auto rois_ptr = rois_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output_tmp);
auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
auto output_ptr = output_impl->cnnlMalloc();
KernelRoiAlignRotatedForward(k_dim, k_type, queue, d_type, input_ptr,
rois_ptr, output_ptr, batch, height, width,
channel, rois_nums, roiAlignRotatedParams);
output.copy_(output_tmp);
// get compute handle
auto handle = mluOpGetCurrentHandle();
TORCH_MLUOP_CHECK(mluOpRoiAlignRotatedForward(
handle, input_desc.desc(), input_ptr, rois_desc.desc(), rois_ptr,
pooled_height, pooled_width, sampling_ratio, spatial_scale, aligned,
clockwise, output_desc.desc(), output_ptr));
output.copy_(output_contiguous);
}
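// The NHWC preparation above recurs across these launchers; a sketch of the
// shared steps (helper name is hypothetical, calls are the ones already used
// in this file):
namespace {
inline void to_nhwc_desc_sketch(const at::Tensor &t, MluOpTensorDescriptor &d,
                                at::Tensor &holder) {
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(t.dim());
  holder = torch_mlu::cnnl::ops::cnnl_contiguous(t, memory_format);
  d.set_with_layout(holder, MLUOP_LAYOUT_NHWC);
}
}  // namespace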
void ROIAlignRotatedBackwardMLUKernelLauncher(
Tensor top_grad, Tensor rois, Tensor bottom_grad, int pooled_height,
int pooled_width, float spatial_scale, int sampling_ratio, bool aligned,
bool clockwise) {
TORCH_CHECK(((top_grad.scalar_type() == bottom_grad.scalar_type()) &&
(bottom_grad.scalar_type() == rois.scalar_type())),
"data types of top_grad, rois and bottom_grad should be ",
"the same, but now top_grad type is ", top_grad.scalar_type(),
", rois type is ", rois.scalar_type(), ", bottom_grad type is ",
bottom_grad.scalar_type(), ".");
TORCH_CHECK((bottom_grad.scalar_type() == at::kFloat ||
bottom_grad.scalar_type() == at::kHalf),
"Data type of bottom_grad should be Float ro Half, got ",
bottom_grad.scalar_type(), ".");
  TORCH_CHECK(bottom_grad.dim() == 4, "bottom_grad should be a 4d tensor, got ",
              bottom_grad.dim(), "D.");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D.");
  TORCH_CHECK(top_grad.dim() == 4, "top_grad should be a 4d tensor, got ",
              top_grad.dim(), "D.");
TORCH_CHECK((rois.size(0) == top_grad.size(0)),
"the 1st dimensions of rois and top_grad should be the same, ",
"but now the 1st dimension of rois is ", rois.size(0),
", and top_grad is ", top_grad.size(0), ".");
TORCH_CHECK((bottom_grad.size(1) == top_grad.size(1)),
"the 2nd dimensions of bottom_grad and top_grad should be ",
"the same, but now the 2nd dimension of bottom_grad is ",
bottom_grad.size(1), ", and top_grad is ", top_grad.size(1), ".");
int channel = bottom_grad.size(1);
int width = bottom_grad.size(3);
int height = bottom_grad.size(2);
int batch = bottom_grad.size(0);
int rois_nums = rois.size(0);
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(bottom_grad.dtype());
// return if zero-elements
if (bottom_grad.numel() == 0) {
CNLOG(INFO) << "Skip the zero-elements case.";
return;
}
RoiAlignRotatedParams roiAlignRotatedParams{pooled_height, pooled_width,
sampling_ratio, spatial_scale,
aligned, clockwise};
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(rois_nums * pooled_height * pooled_width, &k_dim, &k_type);
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(top_grad.dim());
auto top_grad_tensor =
auto top_grad_ =
torch_mlu::cnnl::ops::cnnl_contiguous(top_grad, memory_format);
at::Tensor bottom_grad_tmp = at::empty({batch, channel, height, width},
top_grad.options(), memory_format)
.zero_();
// get compute queue
auto queue = torch_mlu::getCurQueue();
auto rois_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
auto bottom_grad_ =
torch_mlu::cnnl::ops::cnnl_contiguous(bottom_grad, memory_format);
// get ptr of tensors
auto bottom_grad_impl = torch_mlu::getMluTensorImpl(bottom_grad_tmp);
auto bottom_grad_ptr = bottom_grad_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto rois_ptr = rois_impl->cnnlMalloc();
auto top_grad_impl = torch_mlu::getMluTensorImpl(top_grad_tensor);
auto top_grad_impl = torch_mlu::getMluTensorImpl(top_grad_);
auto top_grad_ptr = top_grad_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
auto rois_ptr = rois_impl->cnnlMalloc();
auto bottom_grad_impl = torch_mlu::getMluTensorImpl(bottom_grad_);
auto bottom_grad_ptr = bottom_grad_impl->cnnlMalloc();
KernelRoiAlignRotatedBackward(k_dim, k_type, queue, d_type, top_grad_ptr,
rois_ptr, bottom_grad_ptr, batch, height, width,
channel, rois_nums, roiAlignRotatedParams);
bottom_grad.copy_(bottom_grad_tmp);
MluOpTensorDescriptor top_grad_desc, rois_desc, bottom_grad_desc;
top_grad_desc.set_with_layout(top_grad_, MLUOP_LAYOUT_NHWC);
rois_desc.set(rois_contiguous);
bottom_grad_desc.set_with_layout(bottom_grad_, MLUOP_LAYOUT_NHWC);
// get compute handle
auto handle = mluOpGetCurrentHandle();
TORCH_MLUOP_CHECK(mluOpRoiAlignRotatedBackward(
handle, top_grad_desc.desc(), top_grad_ptr, rois_desc.desc(), rois_ptr,
pooled_height, pooled_width, sampling_ratio, spatial_scale, aligned,
clockwise, bottom_grad_desc.desc(), bottom_grad_ptr));
bottom_grad.copy_(bottom_grad_);
}
void roi_align_rotated_forward_mlu(Tensor input, Tensor rois, Tensor output,
......
......@@ -9,49 +9,7 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelPtsIdxOfVoxels(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const int pool_method, const int boxes_num,
const int pts_num, const int max_pts_each_voxel,
const int out_x, const int out_y, const int out_z,
const void *rois, const void *pts,
int *pts_idx_of_voxels);
void KernelRoiawarePool3dForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const int pool_method, const int boxes_num,
const int pts_num, const int channels, const int max_pts_each_voxel,
const int out_x, const int out_y, const int out_z, const void *pts_feature,
const int *pts_idx_of_voxels, void *pooled_features, int *argmax);
// policy function
static void kernelPtsIdxOfVoxelsPolicyFunc(const int boxes_num,
cnrtDim3_t *k_dim,
cnrtFunctionType_t *k_type) {
unsigned int core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
unsigned int cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = core_num;
unsigned int use_cluster = (boxes_num + core_num - 1) / core_num;
k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster;
k_dim->z = 1;
}
static void kernelRoiawarePool3dForwardPolicyFunc(
const int boxes_num, const int out_x, const int out_y, const int out_z,
cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
unsigned int core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
unsigned int cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = core_num;
const int voxels_num = boxes_num * out_x * out_y * out_z;
unsigned int use_cluster = (voxels_num + core_num - 1) / core_num;
k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster;
k_dim->z = 1;
}
#include "mlu_common_helper.h"
void RoiawarePool3dForwardMLUKernelLauncher(
const int pool_method, const int boxes_num, const int pts_num,
......@@ -59,168 +17,65 @@ void RoiawarePool3dForwardMLUKernelLauncher(
const int out_y, const int out_z, const Tensor rois, const Tensor pts,
const Tensor pts_feature, Tensor pts_idx_of_voxels, Tensor pooled_features,
Tensor argmax) {
// check datatype
TORCH_CHECK(((pts.scalar_type() == rois.scalar_type()) &&
(pts_feature.scalar_type() == rois.scalar_type()) &&
(pooled_features.scalar_type() == rois.scalar_type())),
"data types of rois, rois, pts_feature and pooled_features "
"should be the same, ",
"but now rois type is ", rois.scalar_type(), ", pts type is ",
pts.scalar_type(), ", pts_feature type is ",
pts_feature.scalar_type(), ", pooled_features type is ",
pooled_features.scalar_type(), ".");
TORCH_CHECK(
(rois.scalar_type() == at::kFloat || rois.scalar_type() == at::kHalf),
"rois type should be Float or Half, got ", rois.scalar_type(), ".");
TORCH_CHECK((pts_idx_of_voxels.scalar_type() == at::kInt),
"pts_idx_of_voxels type should be Int, got ",
pts_idx_of_voxels.scalar_type(), ".");
// check dim
TORCH_CHECK(rois.dim() == 2, "rois should be a 2D tensor, got ", rois.dim(),
"D.");
TORCH_CHECK(pts.dim() == 2, "pts should be a 2D tensor, got ", pts.dim(),
"D.");
TORCH_CHECK(pts_feature.dim() == 2, "pts_feature should be a 2D tensor, got ",
pts_feature.dim(), "D.");
TORCH_CHECK(pts_idx_of_voxels.dim() == 5,
"pts_idx_of_voxels should be a 5D tensor, got ",
pts_idx_of_voxels.dim(), "D.");
TORCH_CHECK(pooled_features.dim() == 5,
"pooled_features should be a 5D tensor, got ",
pooled_features.dim(), "D.");
// check shape
TORCH_CHECK(((rois.size(0) == boxes_num) && (rois.size(1) == 7)),
"the dimensions of rois should be (boxes_num, 7), ", "but got (",
rois.size(0), ", ", rois.size(1), ") .");
TORCH_CHECK(((pts.size(0) == pts_num) && (pts.size(1) == 3)),
"the dimensions of pts should be (pts_num, 3), ", "but got (",
pts.size(0), ",", pts.size(1), ").");
TORCH_CHECK(
((pts_feature.size(0) == pts_num) && (pts_feature.size(1) == channels)),
"the dimensions of pts_feature should be (pts_num, channels), ",
"but got (", pts_feature.size(0), ",", pts_feature.size(1), ").");
TORCH_CHECK(((pts_idx_of_voxels.size(0) == boxes_num) &&
(pts_idx_of_voxels.size(1) == out_x) &&
(pts_idx_of_voxels.size(2) == out_y) &&
(pts_idx_of_voxels.size(3) == out_z) &&
(pts_idx_of_voxels.size(4) == max_pts_each_voxel)),
"the dimensions of pts_idx_of_voxels should be (boxes_num, "
"out_x, out_y, out_z, max_pts_each_voxel), ",
"but got (", pts_idx_of_voxels.size(0), ",",
pts_idx_of_voxels.size(1), ",", pts_idx_of_voxels.size(2), ",",
pts_idx_of_voxels.size(3), ",", pts_idx_of_voxels.size(4), ").");
TORCH_CHECK(((pooled_features.size(0) == boxes_num) &&
(pooled_features.size(1) == out_x) &&
(pooled_features.size(2) == out_y) &&
(pooled_features.size(3) == out_z) &&
(pooled_features.size(4) == channels)),
"the dimensions of pooled_features should be (boxes_num, out_x, "
"out_y, out_z, channels), ",
"but got (", pooled_features.size(0), ",",
pooled_features.size(1), ",", pooled_features.size(2), ",",
pooled_features.size(3), ",", pooled_features.size(4), ").");
  // check other params : pool_method
  TORCH_CHECK(((pool_method == 0) || (pool_method == 1)),
              "pool_method should be 0 (max) or 1 (avg), ", "but got ",
pool_method, ".");
// check large tensor
const size_t max_input_size = 2147483648;
TORCH_CHECK(rois.numel() < max_input_size,
"rois element num should be less than 2^31, got ", rois.numel(),
".");
TORCH_CHECK(pts.numel() < max_input_size,
"pts element num should be less than 2^31, got ", pts.numel(),
".");
TORCH_CHECK(pts_feature.numel() < max_input_size,
"pts_feature element num should be less than 2^31, got ",
pts_feature.numel(), ".");
TORCH_CHECK(pts_idx_of_voxels.numel() < max_input_size,
"pts_idx_of_voxels element num should be less than 2^31, got ",
pts_idx_of_voxels.numel(), ".");
TORCH_CHECK(pooled_features.numel() < max_input_size,
"pooled_features element num should be less than 2^31, got ",
pooled_features.numel(), ".");
// check zero element
TORCH_CHECK(rois.numel() != 0, "rois.numel() should not be zero, got ",
rois.numel());
TORCH_CHECK(pts.numel() != 0, "pts.numel() should not be zero, got ",
pts.numel());
TORCH_CHECK(pts_feature.numel() != 0,
"pts_feature.numel() should not be zero, got ",
pts_feature.numel());
TORCH_CHECK(pts_idx_of_voxels.numel() != 0,
"pts_idx_of_voxels.numel() should not be zero, got ",
pts_idx_of_voxels.numel());
TORCH_CHECK(pooled_features.numel() != 0,
"pooled_features.numel() should not be zero, got ",
pooled_features.numel());
if (pool_method == 0) {
// check datatype
TORCH_CHECK((argmax.scalar_type() == at::kInt),
"argmax type should be Int, got ", argmax.scalar_type(), ".");
// check dim
TORCH_CHECK(argmax.dim() == 5, "argmax should be a 5D tensor, got ",
argmax.dim(), "D.");
// check shape
TORCH_CHECK(((argmax.size(0) == boxes_num) && (argmax.size(1) == out_x) &&
(argmax.size(2) == out_y) && (argmax.size(3) == out_z) &&
(argmax.size(4) == channels)),
"the dimensions of argmax should be (boxes_num, out_x, out_y, "
"out_z, channels), ",
"but got (", argmax.size(0), ",", argmax.size(1), ",",
argmax.size(2), ",", argmax.size(3), ",", argmax.size(4), ").");
// check large tensor
TORCH_CHECK(argmax.numel() < max_input_size,
"argmax element num should be less than 2^31, got ",
argmax.numel(), ".");
// check zero element
TORCH_CHECK(argmax.numel() != 0, "argmax.numel() should not be zero, got ",
argmax.numel());
// when pool_method is 0, which is max pool, init argmax data value to -1
argmax.fill_(static_cast<int>(-1));
}
// calculate task one dimension
cnrtDim3_t k1_dim;
cnrtFunctionType_t k1_type;
kernelPtsIdxOfVoxelsPolicyFunc(boxes_num, &k1_dim, &k1_type);
cnrtDim3_t k2_dim;
cnrtFunctionType_t k2_type;
kernelRoiawarePool3dForwardPolicyFunc(boxes_num, out_x, out_y, out_z, &k2_dim,
&k2_type);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
// get compute handle
auto handle = mluOpGetCurrentHandle();
auto rois_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
auto pts_contiguous =
torch_mlu::cnnl::ops::cnnl_contiguous(pts, pts.suggest_memory_format());
auto pts_feature_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
pts_feature, pts_feature.suggest_memory_format());
auto argmax_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
argmax, argmax.suggest_memory_format());
auto pts_idx_of_voxels_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
pts_idx_of_voxels, pts_idx_of_voxels.suggest_memory_format());
auto pooled_features_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
pooled_features, pooled_features.suggest_memory_format());
MluOpTensorDescriptor rois_desc, pts_desc, pts_feature_desc, argmax_desc,
pts_idx_of_voxels_desc, pooled_features_desc;
rois_desc.set(rois_contiguous);
pts_desc.set(pts_contiguous);
pts_feature_desc.set(pts_feature_contiguous);
argmax_desc.set(argmax_contiguous);
pts_idx_of_voxels_desc.set(pts_idx_of_voxels_contiguous);
pooled_features_desc.set(pooled_features_contiguous);
// allocate extra space for workspace
size_t workspace_size = 0;
TORCH_MLUOP_CHECK(mluOpGetRoiawarePool3dForwardWorkspaceSize(
handle, rois_desc.desc(), pts_desc.desc(), pts_feature_desc.desc(),
&workspace_size));
auto workspace = at::empty(workspace_size, rois.options().dtype(at::kByte));
auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
auto workspace_ptr = workspace_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
auto pts_impl = torch_mlu::getMluTensorImpl(pts_contiguous);
auto pts_feature_impl = torch_mlu::getMluTensorImpl(pts_feature_contiguous);
auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_contiguous);
auto pts_idx_of_voxels_impl =
torch_mlu::getMluTensorImpl(pts_idx_of_voxels_contiguous);
auto pooled_features_impl =
torch_mlu::getMluTensorImpl(pooled_features_contiguous);
auto rois_ptr = rois_impl->cnnlMalloc();
// transpose points [pts_num, 3] -> [3, pts_num]
auto pts_ = pts.permute({1, 0}).contiguous();
auto pts_impl = torch_mlu::getMluTensorImpl(pts_);
auto pts_ptr = pts_impl->cnnlMalloc();
// transpose points_features [pts_num, channels] -> [channels, pts_num]
auto pts_feature_ = pts_feature.permute({1, 0}).contiguous();
auto pts_feature_impl = torch_mlu::getMluTensorImpl(pts_feature_);
auto pts_feature_ptr = pts_feature_impl->cnnlMalloc();
auto pts_idx_of_voxels_impl = torch_mlu::getMluTensorImpl(pts_idx_of_voxels);
auto argmax_ptr = argmax_impl->cnnlMalloc();
auto pts_idx_of_voxels_ptr = pts_idx_of_voxels_impl->cnnlMalloc();
auto pooled_features_impl = torch_mlu::getMluTensorImpl(pooled_features);
auto pooled_features_ptr = pooled_features_impl->cnnlMalloc();
auto argmax_impl = torch_mlu::getMluTensorImpl(argmax);
auto argmax_ptr = argmax_impl->cnnlMalloc();
// get compute dtype of input
cnrtDataType_t data_type = torch_mlu::toCnrtDtype(rois.dtype());
// launch kernel PtsIdxOfVoxels
CNLOG(INFO) << "Launch Kernel MLUKernel PtsIdxOfVoxels<<<" << k1_dim.x << ", "
<< k1_dim.y << ", " << k1_dim.z << ">>>";
KernelPtsIdxOfVoxels(k1_dim, k1_type, queue, data_type, pool_method,
boxes_num, pts_num, max_pts_each_voxel, out_x, out_y,
out_z, rois_ptr, pts_ptr, (int *)pts_idx_of_voxels_ptr);
// launch kernel RoiawarePool3dForward
CNLOG(INFO) << "Launch Kernel MLUKernel RoiawarePool3dForward<<<" << k2_dim.x
<< ", " << k2_dim.y << ", " << k2_dim.z << ">>>";
KernelRoiawarePool3dForward(
k2_dim, k2_type, queue, data_type, pool_method, boxes_num, pts_num,
channels, max_pts_each_voxel, out_x, out_y, out_z, pts_feature_ptr,
(int *)pts_idx_of_voxels_ptr, pooled_features_ptr, (int *)argmax_ptr);
CNLOG(INFO) << "Call mluOpRoiawarePool3dForward().";
TORCH_MLUOP_CHECK(mluOpRoiawarePool3dForward(
handle, pool_method, boxes_num, pts_num, channels, rois_desc.desc(),
rois_ptr, pts_desc.desc(), pts_ptr, pts_feature_desc.desc(),
pts_feature_ptr, workspace_ptr, workspace_size, max_pts_each_voxel, out_x,
out_y, out_z, argmax_desc.desc(), argmax_ptr,
pts_idx_of_voxels_desc.desc(), pts_idx_of_voxels_ptr,
pooled_features_desc.desc(), pooled_features_ptr));
}
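// Sketch of the workspace handling used above (helper name is hypothetical):
// query the byte size, keep the backing tensor alive for the duration of the
// call, and pass the raw MLU pointer to the op.
namespace {
inline void *alloc_workspace_sketch(size_t bytes, const at::Tensor &ref,
                                    at::Tensor &holder) {
  holder = at::empty({static_cast<int64_t>(bytes)},
                     ref.options().dtype(at::kByte));
  return torch_mlu::getMluTensorImpl(holder)->cnnlMalloc();
}
}  // namespace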
void roiaware_pool3d_forward_mlu(int boxes_num, int pts_num, int channels,
......@@ -245,136 +100,46 @@ void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
REGISTER_DEVICE_IMPL(roiaware_pool3d_forward_impl, MLU,
roiaware_pool3d_forward_mlu);
void KernelRoiawarePool3dBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const int pool_method, const int boxes_num,
const int out_x, const int out_y, const int out_z, const int channels,
const int max_pts_each_voxel, const int *pts_idx_of_voxels,
const int *argmax, const void *grad_out, void *grad_in);
static void kernelRoiawarePool3dBackwardPolicyFunc(
const int boxes_num, const int out_x, const int out_y, const int out_z,
cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
unsigned int core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
unsigned int cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = core_num;
const int voxels_num = boxes_num * out_x * out_y * out_z;
unsigned int use_cluster = (voxels_num + core_num - 1) / core_num;
k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster;
k_dim->z = 1;
}
void RoiawarePool3dBackwardMLUKernelLauncher(
int pool_method, int boxes_num, int out_x, int out_y, int out_z,
int channels, int max_pts_each_voxel, const Tensor pts_idx_of_voxels,
const Tensor argmax, const Tensor grad_out, Tensor grad_in) {
// check datatype
TORCH_CHECK((pts_idx_of_voxels.scalar_type() == at::kInt),
"pts_idx_of_voxels type should be Int, got ",
pts_idx_of_voxels.scalar_type(), ".");
TORCH_CHECK((argmax.scalar_type() == at::kInt),
"argmax type should be Int, got ", argmax.scalar_type(), ".");
TORCH_CHECK((grad_out.scalar_type() == at::kFloat ||
grad_out.scalar_type() == at::kHalf),
"grad_out type should be Float or Half, got ",
grad_out.scalar_type(), ".");
TORCH_CHECK((grad_out.scalar_type() == grad_in.scalar_type()),
"data types of grad_out, grad_in, should be the same, ",
"but now grad_out type is ", grad_out.scalar_type(),
", grad_in type is ", grad_in.scalar_type(), ".");
// check dim
TORCH_CHECK(pts_idx_of_voxels.dim() == 5,
"pts_idx_of_voxels should be a 5D tensor, got ",
pts_idx_of_voxels.dim(), "D.");
TORCH_CHECK(argmax.dim() == 5, "argmax should be a 5D tensor, got ",
argmax.dim(), "D.");
TORCH_CHECK(grad_out.dim() == 5, "grad_out should be a 5D tensor, got ",
grad_out.dim(), "D.");
TORCH_CHECK(grad_in.dim() == 2, "grad_in should be a 2D tensor, got ",
grad_in.dim(), "D.");
// check shape
TORCH_CHECK(((pts_idx_of_voxels.size(0) == boxes_num) &&
(pts_idx_of_voxels.size(1) == out_x) &&
(pts_idx_of_voxels.size(2) == out_y) &&
(pts_idx_of_voxels.size(3) == out_z) &&
(pts_idx_of_voxels.size(4) == max_pts_each_voxel)),
"the dimensions of pts_idx_of_voxels should be (boxes_num, "
"out_x, out_y, out_z, max_pts_each_voxel), ",
"but got (", pts_idx_of_voxels.size(0), ",",
pts_idx_of_voxels.size(1), ",", pts_idx_of_voxels.size(2), ",",
pts_idx_of_voxels.size(3), ",", pts_idx_of_voxels.size(4), ").");
TORCH_CHECK(((argmax.size(0) == boxes_num) && (argmax.size(1) == out_x) &&
(argmax.size(2) == out_y) && (argmax.size(3) == out_z) &&
(argmax.size(4) == channels)),
"the dimensions of argmax should be (boxes_num, out_x, out_y, "
"out_z, channels), ",
"but got (", argmax.size(0), ",", argmax.size(1), ",",
argmax.size(2), ",", argmax.size(3), ",", argmax.size(4), ").");
TORCH_CHECK(((grad_out.size(0) == boxes_num) && (grad_out.size(1) == out_x) &&
(grad_out.size(2) == out_y) && (grad_out.size(3) == out_z) &&
(grad_out.size(4) == channels)),
"the dimensions of grad_out should be (boxes_num, out_x, "
"out_y, out_z, channels), ",
"but got (", grad_out.size(0), ",", grad_out.size(1), ",",
grad_out.size(2), ",", grad_out.size(3), ",", grad_out.size(4),
").");
TORCH_CHECK((grad_in.size(1) == channels),
"the 1st dimensions of grad_in should be channels, ", "but got ",
grad_in.size(1), ".");
  // check other params : pool_method
  TORCH_CHECK(((pool_method == 0) || (pool_method == 1)),
              "pool_method should be 0 (max) or 1 (avg), ", "but got ",
pool_method, ".");
// check large tensor
const size_t max_input_size = 2147483648;
TORCH_CHECK(pts_idx_of_voxels.numel() < max_input_size,
"pts_idx_of_voxels element num should be less than 2^31, got ",
pts_idx_of_voxels.numel(), ".");
TORCH_CHECK(argmax.numel() < max_input_size,
"argmax element num should be less than 2^31, got ",
argmax.numel(), ".");
TORCH_CHECK(grad_out.numel() < max_input_size,
"grad_out element num should be less than 2^31, got ",
grad_out.numel(), ".");
TORCH_CHECK(grad_in.numel() < max_input_size,
"grad_in element num should be less than 2^31, got ",
grad_in.numel(), ".");
// check zero element
TORCH_CHECK(pts_idx_of_voxels.numel() != 0,
"pts_idx_of_voxels.numel() should not be zero, got ",
pts_idx_of_voxels.numel());
TORCH_CHECK(argmax.numel() != 0, "argmax.numel() should not be zero, got ",
argmax.numel());
TORCH_CHECK(grad_out.numel() != 0,
"grad_out.numel() should not be zero, got ", grad_out.numel());
TORCH_CHECK(grad_in.numel() != 0, "grad_in.numel() should not be zero, got ",
grad_in.numel());
// calculate task one dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
kernelRoiawarePool3dBackwardPolicyFunc(boxes_num, out_x, out_y, out_z, &k_dim,
&k_type);
// get compute queue
auto queue = torch_mlu::getCurQueue();
  // get ptr of tensors
auto pts_idx_of_voxels_impl = torch_mlu::getMluTensorImpl(pts_idx_of_voxels);
// get compute handle
auto handle = mluOpGetCurrentHandle();
auto pts_idx_of_voxels_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
pts_idx_of_voxels, pts_idx_of_voxels.suggest_memory_format());
auto argmax_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
argmax, argmax.suggest_memory_format());
auto grad_out_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
grad_out, grad_out.suggest_memory_format());
auto grad_in_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
grad_in, grad_in.suggest_memory_format());
MluOpTensorDescriptor pts_idx_of_voxels_desc, argmax_desc, grad_out_desc,
grad_in_desc;
pts_idx_of_voxels_desc.set(pts_idx_of_voxels_contiguous);
argmax_desc.set(argmax_contiguous);
grad_out_desc.set(grad_out_contiguous);
grad_in_desc.set(grad_in_contiguous);
auto pts_idx_of_voxels_impl =
torch_mlu::getMluTensorImpl(pts_idx_of_voxels_contiguous);
auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_contiguous);
auto grad_out_impl = torch_mlu::getMluTensorImpl(grad_out_contiguous);
auto grad_in_impl = torch_mlu::getMluTensorImpl(grad_in_contiguous);
auto pts_idx_of_voxels_ptr = pts_idx_of_voxels_impl->cnnlMalloc();
auto argmax_impl = torch_mlu::getMluTensorImpl(argmax);
auto argmax_ptr = argmax_impl->cnnlMalloc();
auto grad_out_impl = torch_mlu::getMluTensorImpl(grad_out);
auto grad_out_ptr = grad_out_impl->cnnlMalloc();
auto grad_in_impl = torch_mlu::getMluTensorImpl(grad_in);
auto grad_in_ptr = grad_in_impl->cnnlMalloc();
// get compute dtype of input
cnrtDataType_t data_type = torch_mlu::toCnrtDtype(grad_out.dtype());
  // launch kernel RoiawarePool3dBackward
CNLOG(INFO) << "Launch Kernel MLUKernel RoiawarePool3dBackward<<<" << k_dim.x
<< ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelRoiawarePool3dBackward(k_dim, k_type, queue, data_type, pool_method,
boxes_num, out_x, out_y, out_z, channels,
max_pts_each_voxel, (int *)pts_idx_of_voxels_ptr,
(int *)argmax_ptr, grad_out_ptr, grad_in_ptr);
CNLOG(INFO) << "Call mluOpRoiawarePool3dBackward().";
TORCH_MLUOP_CHECK(mluOpRoiawarePool3dBackward(
handle, pool_method, boxes_num, out_x, out_y, out_z, channels,
max_pts_each_voxel, pts_idx_of_voxels_desc.desc(), pts_idx_of_voxels_ptr,
argmax_desc.desc(), argmax_ptr, grad_out_desc.desc(), grad_out_ptr,
grad_in_desc.desc(), grad_in_ptr));
}
void roiaware_pool3d_backward_mlu(int boxes_num, int out_x, int out_y,
......