Unverified commit 2e5628b4 authored by q.yao, committed by GitHub

[Refactor]: Remove deployment for dev-2.x (#2225)

* remove deploy for 2.0

* update onnx ut
parent 961373ad
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ORT_MMCV_UTILS_H
#define ORT_MMCV_UTILS_H
#include <onnxruntime_cxx_api.h>
#include <vector>
struct OrtTensorDimensions : std::vector<int64_t> {
OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue* value) {
OrtTensorTypeAndShapeInfo* info = ort.GetTensorTypeAndShape(value);
std::vector<int64_t>::operator=(ort.GetTensorShape(info));
ort.ReleaseTensorTypeAndShapeInfo(info);
}
};
#endif // ORT_MMCV_UTILS_H
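// --- Usage sketch (illustrative, not part of the original header) ---------
// A minimal kernel (hypothetical name) showing how OrtTensorDimensions is
// typically consumed in a Compute() method: read the input shape, allocate
// an output of the same shape, and copy the data across.
struct IdentityKernelSketch {
  IdentityKernelSketch(Ort::CustomOpApi ort) : ort_(ort) {}
  void Compute(OrtKernelContext* context) {
    const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
    const float* input_data = ort_.GetTensorData<float>(input);
    // OrtTensorDimensions acts as a std::vector<int64_t> holding the shape.
    OrtTensorDimensions dims(ort_, input);
    OrtValue* output =
        ort_.KernelContext_GetOutput(context, 0, dims.data(), dims.size());
    float* output_data = ort_.GetTensorMutableData<float>(output);
    int64_t numel = 1;
    for (auto d : dims) numel *= d;
    for (int64_t i = 0; i < numel; ++i) output_data[i] = input_data[i];
  }
  Ort::CustomOpApi ort_;
};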
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_REDUCE_OPS_H
#define ONNXRUNTIME_REDUCE_OPS_H
#include <onnxruntime_cxx_api.h>
struct MMCVCumMaxKernel {
public:
MMCVCumMaxKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
dim_ = ort_.KernelInfoGetAttribute<int64_t>(info, "dim");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t dim_;
};
struct MMCVCumMinKernel {
public:
MMCVCumMinKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
dim_ = ort_.KernelInfoGetAttribute<int64_t>(info, "dim");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t dim_;
};
struct MMCVCumMaxCustomOp
: Ort::CustomOpBase<MMCVCumMaxCustomOp, MMCVCumMaxKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVCumMaxKernel(api, info);
}
const char* GetName() const { return "cummax"; }
size_t GetInputTypeCount() const { return 1; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 2; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
if (index == 1) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
struct MMCVCumMinCustomOp
: Ort::CustomOpBase<MMCVCumMinCustomOp, MMCVCumMinKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVCumMinKernel(api, info);
}
const char* GetName() const { return "cummin"; }
size_t GetInputTypeCount() const { return 1; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 2; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
if (index == 1) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif // ONNXRUNTIME_REDUCE_OPS_H
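// --- Registration sketch (illustrative, not part of the original header) --
// Before running a model that uses these ops, they must be attached to the
// session through a custom op domain. The domain name "mmcv" and the helper
// below are assumptions for the example; the project may register its ops
// elsewhere. Call it once on the Ort::SessionOptions used to build the
// session.
inline void RegisterCumOpsSketch(Ort::SessionOptions& session_options) {
  static MMCVCumMaxCustomOp cummax_op;
  static MMCVCumMinCustomOp cummin_op;
  // The domain must stay alive until the session has been created.
  static Ort::CustomOpDomain domain("mmcv");
  domain.Add(&cummax_op);
  domain.Add(&cummin_op);
  session_options.Add(domain);
}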
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_H
#define ONNXRUNTIME_ROI_ALIGN_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
struct MMCVRoiAlignKernel {
public:
MMCVRoiAlignKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
aligned_height_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
pool_mode_ = ort_.KernelInfoGetAttribute<std::string>(info, "mode");
sampling_ratio_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int aligned_height_;
int aligned_width_;
float spatial_scale_;
int sampling_ratio_;
std::string pool_mode_;
int aligned_;
};
struct MMCVRoiAlignCustomOp
: Ort::CustomOpBase<MMCVRoiAlignCustomOp, MMCVRoiAlignKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRoiAlignKernel(api, info);
}
const char* GetName() const { return "MMCVRoiAlign"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_ROI_ALIGN_H
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#define ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
struct MMCVRoIAlignRotatedKernel {
public:
MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
aligned_height_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
sampling_ratio_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
clockwise_ = ort_.KernelInfoGetAttribute<int64_t>(info, "clockwise");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int aligned_height_;
int aligned_width_;
float spatial_scale_;
int sampling_ratio_;
int aligned_;
int clockwise_;
};
struct MMCVRoIAlignRotatedCustomOp
: Ort::CustomOpBase<MMCVRoIAlignRotatedCustomOp,
MMCVRoIAlignRotatedKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRoIAlignRotatedKernel(api, info);
}
const char* GetName() const { return "MMCVRoIAlignRotated"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#ifndef ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
#define ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
#include <onnxruntime_cxx_api.h>
#include <cmath>
struct MMCVRotatedFeatureAlignKernel {
public:
MMCVRotatedFeatureAlignKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
points_ = ort_.KernelInfoGetAttribute<int64_t>(info, "points");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
float spatial_scale_;
int points_;
};
struct MMCVRotatedFeatureAlignCustomOp
: Ort::CustomOpBase<MMCVRotatedFeatureAlignCustomOp,
MMCVRotatedFeatureAlignKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRotatedFeatureAlignKernel(api, info);
}
const char* GetName() const { return "MMCVRotatedFeatureAlign"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_SOFT_NMS_H
#define ONNXRUNTIME_SOFT_NMS_H
#include <onnxruntime_cxx_api.h>
struct SoftNmsKernel {
SoftNmsKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
float iou_threshold_;
float sigma_;
float min_score_;
int64_t method_;
int64_t offset_;
};
struct SoftNmsOp : Ort::CustomOpBase<SoftNmsOp, SoftNmsKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new SoftNmsKernel(api, info);
};
const char *GetName() const { return "SoftNonMaxSuppression"; };
size_t GetInputTypeCount() const { return 2; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 2; };
ONNXTensorElementDataType GetOutputType(size_t index) const {
if (index == 1) {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif // ONNXRUNTIME_SOFT_NMS_H
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_corner_pool.hpp"
#include <assert.h>
#include "trt_serialize.hpp"
void CornerPoolForwardLauncher_float(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width,
const int pool_type, cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *CORNER_POOL_PLUGIN_NAME{"MMCVCornerPool"};
} // namespace
CornerPoolPluginDynamic::CornerPoolPluginDynamic(const std::string &name,
TRT_CORNER_POOL_TYPE poolType)
: mLayerName(name), mPoolType(poolType) {}
CornerPoolPluginDynamic::CornerPoolPluginDynamic(const std::string name,
const void *data,
size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mPoolType);
}
CornerPoolPluginDynamic::~CornerPoolPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt *CornerPoolPluginDynamic::clone() const {
CornerPoolPluginDynamic *plugin =
new CornerPoolPluginDynamic(mLayerName, mPoolType);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs CornerPoolPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
return inputs[0];
}
bool CornerPoolPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
switch (pos) {
// input[0]
case 0:
return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
// output[0]
case 1:
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == inOut[0].format;
default:
return false;
}
}
void CornerPoolPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}
size_t CornerPoolPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
// The corner pool kernel writes its result directly into the output tensor,
// so no extra workspace is required.
return 0;
}
int CornerPoolPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) {
const void *input = inputs[0];
void *output_value = outputs[0];
const int batch_size = inputDesc[0].dims.d[0];
const int channels = inputDesc[0].dims.d[1];
const int height = inputDesc[0].dims.d[2];
const int width = inputDesc[0].dims.d[3];
CornerPoolForwardLauncher_float((float *)input, (float *)output_value,
batch_size, channels, height, width,
int(mPoolType), stream);
return 0;
}
nvinfer1::DataType CornerPoolPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
return inputTypes[0];
}
// IPluginV2 Methods
const char *CornerPoolPluginDynamic::getPluginType() const {
switch (mPoolType) {
case TRT_CORNER_POOL_TYPE::TRT_TOP_POOL:
case TRT_CORNER_POOL_TYPE::TRT_BOTTOM_POOL:
case TRT_CORNER_POOL_TYPE::TRT_LEFT_POOL:
case TRT_CORNER_POOL_TYPE::TRT_RIGHT_POOL:
return CORNER_POOL_PLUGIN_NAME;
default:
return "UnknownPoolType";
}
}
const char *CornerPoolPluginDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int CornerPoolPluginDynamic::getNbOutputs() const { return 1; }
int CornerPoolPluginDynamic::initialize() { return 0; }
void CornerPoolPluginDynamic::terminate() {}
size_t CornerPoolPluginDynamic::getSerializationSize() const {
return sizeof(mPoolType);
}
void CornerPoolPluginDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mPoolType);
}
void CornerPoolPluginDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void CornerPoolPluginDynamic::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *CornerPoolPluginDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
CornerPoolPluginDynamicCreator::CornerPoolPluginDynamicCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("mode"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *CornerPoolPluginDynamicCreator::getPluginName() const {
return CORNER_POOL_PLUGIN_NAME;
}
const char *CornerPoolPluginDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
CornerPoolPluginDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *CornerPoolPluginDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
TRT_CORNER_POOL_TYPE poolType;
int poolMode = -1;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("mode") == 0) {
poolMode = static_cast<const int *>(fc->fields[i].data)[0];
}
}
assert(poolMode >= 0 && poolMode <= 3);
switch (poolMode) {
case 0:
poolType = TRT_CORNER_POOL_TYPE::TRT_TOP_POOL;
break;
case 1:
poolType = TRT_CORNER_POOL_TYPE::TRT_BOTTOM_POOL;
break;
case 2:
poolType = TRT_CORNER_POOL_TYPE::TRT_LEFT_POOL;
break;
case 3:
poolType = TRT_CORNER_POOL_TYPE::TRT_RIGHT_POOL;
break;
default:
break;
}
CornerPoolPluginDynamic *plugin = new CornerPoolPluginDynamic(name, poolType);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *CornerPoolPluginDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
// This object will be deleted when the network is destroyed, which will
// call CornerPoolPluginDynamic::destroy()
auto plugin = new CornerPoolPluginDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void CornerPoolPluginDynamicCreator::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *CornerPoolPluginDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
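// --- Registration sketch (illustrative, not part of the original file) ----
// Before an engine containing "MMCVCornerPool" layers can be built or
// deserialized, the creator above has to be registered with TensorRT's
// global plugin registry. Assuming CornerPoolPluginDynamicCreator derives
// from nvinfer1::IPluginCreator, one way is the stock macro, typically
// placed in a single translation unit of the plugin library:
//
//   #include <NvInferRuntime.h>
//   REGISTER_TENSORRT_PLUGIN(CornerPoolPluginDynamicCreator);
//
// or, equivalently, an explicit call at library initialization time:
//
//   static CornerPoolPluginDynamicCreator corner_pool_creator;
//   getPluginRegistry()->registerCreator(corner_pool_creator, "");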
// Copyright (c) OpenMMLab. All rights reserved
#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
template <typename scalar_t>
__global__ void top_bottom_pool_kernel(const scalar_t *input, scalar_t *output,
const int batch_size, const int channels,
const int height, const int width,
const int pool_type) {
const int nthreads = batch_size * channels * width;
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n_idx = index / (channels * width); // batch
int w_idx = index % width; // width
int c_idx = (index / width) % channels; // channels
int offset_n = n_idx * channels * width * height;
int offset_n_c = offset_n + c_idx * width * height;
int direction = -1; // -1 or 1, default for TopPool
int index_start = height - 2; // default for TopPool
// pool_type in [0, 1]
if (pool_type == 0) {
// TopPool
// directly copy the most bottom value from input to output
output[offset_n_c + (height - 1) * width + w_idx] =
input[offset_n_c + (height - 1) * width + w_idx];
} else {
// BottomPool
// directly copy the most top value from input to output
output[offset_n_c + w_idx] = input[offset_n_c + w_idx];
index_start = 1;
direction = 1;
}
// do pool
for (int h = index_start; h >= 0 && h < height; h += direction) {
output[offset_n_c + h * width + w_idx] =
max(output[offset_n_c + (h - direction) * width + w_idx],
input[offset_n_c + h * width + w_idx]);
}
}
}
template <typename scalar_t>
__global__ void left_right_pool_kernel(const scalar_t *input, scalar_t *output,
const int batch_size, const int channels,
const int height, const int width,
const int pool_type) {
const int nthreads = batch_size * channels * height;
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n_idx = index / (channels * height); // batch
int h_idx = index % height; // height
int c_idx = (index / height) % channels; // channels
int offset_n = n_idx * channels * width * height;
int offset_n_c = offset_n + c_idx * width * height;
int offset_n_c_h = offset_n_c + h_idx * width;
int direction = -1; // -1 or 1, default for LeftPool
int index_start = width - 2; // default for LeftPool
// pool_type in [2, 3]
if (pool_type == 2) {
// LeftPool
// directly copy the most right value from input to output
output[offset_n_c_h + width - 1] = input[offset_n_c_h + width - 1];
} else {
// RightPool
// directly copy the most left value from input to output
output[offset_n_c_h] = input[offset_n_c_h];
index_start = 1;
direction = 1;
}
// do pool
for (int w = index_start; w >= 0 && w < width; w += direction) {
output[offset_n_c_h + w] =
max(output[offset_n_c_h + w - direction], input[offset_n_c_h + w]);
}
}
}
template <typename scalar_t>
void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output,
const int batch_size, const int channels,
const int height, const int width,
const int pool_type, cudaStream_t stream) {
int nthreads = -1, col_block = -1;
switch (pool_type) {
case 0:
case 1:
nthreads = batch_size * channels * width;
col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
top_bottom_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type);
break;
case 2:
case 3:
nthreads = batch_size * channels * height;
col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
left_right_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type);
break;
}
}
void CornerPoolForwardLauncher_float(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width,
const int pool_type, cudaStream_t stream) {
CornerPoolForwardLauncher<float>(input, output, batch_size, channels, height,
width, pool_type, stream);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <cublas_v2.h>
#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
using mmcv::TensorDesc;
template <class scalar_t>
__global__ void copy_permute_kernel(scalar_t *dst, const scalar_t *src, int n,
TensorDesc ts_src_stride,
TensorDesc ts_dst_stride,
TensorDesc ts_permute) {
const int src_dim = ts_src_stride.dim;
int *src_stride = &(ts_src_stride.stride[0]);
int *dst_stride = &(ts_dst_stride.stride[0]);
int *permute = &(ts_permute.shape[0]);
CUDA_1D_KERNEL_LOOP(index, n) {
size_t dst_index = index;
size_t src_index = 0;
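// Decompose the destination linear index into per-dimension indices using
// the destination strides, and accumulate the corresponding source offset
// through the permuted source strides.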
for (int i = 0; i < src_dim; ++i) {
int dim_index = dst_index / dst_stride[i];
dst_index = dst_index % dst_stride[i];
src_index += dim_index * src_stride[permute[i]];
}
dst[index] = src[src_index];
}
}
template <class scalar_t>
void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size,
int *permute, int src_dim, cudaStream_t stream) {
size_t copy_size = 1;
TensorDesc ts_permute;
memcpy(&(ts_permute.shape[0]), permute, src_dim * sizeof(int));
TensorDesc ts_src_stride;
TensorDesc ts_dst_stride;
ts_src_stride.dim = src_dim;
ts_dst_stride.dim = src_dim;
int *src_stride = &(ts_src_stride.stride[0]);
int *dst_stride = &(ts_dst_stride.stride[0]);
int *dst_size = &(ts_dst_stride.shape[0]);
src_stride[src_dim - 1] = 1;
dst_stride[src_dim - 1] = 1;
for (int i = src_dim - 1; i >= 0; --i) {
dst_size[i] = src_size[permute[i]];
if (i < src_dim - 1) {
src_stride[i] = src_stride[i + 1] * src_size[i + 1];
}
}
for (int i = src_dim - 1; i >= 0; --i) {
copy_size *= dst_size[i];
if (i < src_dim - 1) {
dst_stride[i] = dst_stride[i + 1] * dst_size[i + 1];
}
}
copy_permute_kernel<scalar_t>
<<<GET_BLOCKS(copy_size), THREADS_PER_BLOCK, 0, stream>>>(
dst, src, copy_size, ts_src_stride, ts_dst_stride, ts_permute);
}
template void memcpyPermute<float>(float *dst, const float *src, int *src_size,
int *permute, int src_dim,
cudaStream_t stream);
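// --- Usage sketch (illustrative, not part of the original file) -----------
// memcpyPermute performs an arbitrary permutation of a dense tensor on the
// GPU. As a minimal example, the helper below (a hypothetical name) would
// transpose a row-major 2-D device buffer of shape (rows, cols) into
// (cols, rows); d_src and d_dst are assumed to be device pointers holding
// rows * cols floats each.
inline void transpose2d_sketch(float *d_dst, const float *d_src, int rows,
                               int cols, cudaStream_t stream) {
  int src_size[2] = {rows, cols};
  int permute[2] = {1, 0};  // output dim i is taken from input dim permute[i]
  memcpyPermute<float>(d_dst, d_src, src_size, permute, 2, stream);
}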
template <>
cublasStatus_t cublasGemmWrap<float>(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb, int m, int n,
int k, const float *alpha, const float *A,
int lda, const float *B, int ldb,
const float *beta, float *C, int ldc) {
return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
cublasStatus_t cublasGemmWrap<half>(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb, int m, int n,
int k, const half *alpha, const half *A,
int lda, const half *B, int ldb,
const half *beta, half *C, int ldc) {
return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_cummaxmin.hpp"
#include <assert.h>
#include "trt_serialize.hpp"
void CumMaxMinForwardLauncher_float(const float *input, float *output_value,
int *output_index, const int *dims,
int nbDims, int cum_dim, int cum_type,
cudaStream_t stream);
void CumMaxMinForwardLauncher_int32(const int *input, int *output_value,
int *output_index, const int *dims,
int nbDims, int cum_dim, int cum_type,
cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *CUMMAXMIN_PLUGIN_NAME{"cummaxmin"};
static const char *CUMMAX_PLUGIN_NAME{"cummax"};
static const char *CUMMIN_PLUGIN_NAME{"cummin"};
} // namespace
CumMaxMinPluginDynamic::CumMaxMinPluginDynamic(const std::string &name, int dim,
TRT_CUMCMPTYPE cumType)
: mLayerName(name), mDim(dim), mCumType(cumType) {}
CumMaxMinPluginDynamic::CumMaxMinPluginDynamic(const std::string name,
const void *data, size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mDim);
deserialize_value(&data, &length, &mCumType);
}
CumMaxMinPluginDynamic::~CumMaxMinPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt *CumMaxMinPluginDynamic::clone() const {
CumMaxMinPluginDynamic *plugin =
new CumMaxMinPluginDynamic(mLayerName, mDim, mCumType);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs CumMaxMinPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
return inputs[0];
}
bool CumMaxMinPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
switch (pos) {
// input[0]
case 0:
return (inOut[pos].type == nvinfer1::DataType::kFLOAT ||
inOut[pos].type == nvinfer1::DataType::kINT32) &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
// output[0]
case 1:
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == inOut[0].format;
// output[1]
case 2:
return inOut[pos].type == nvinfer1::DataType::kINT32 &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
default:
return false;
}
}
void CumMaxMinPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}
size_t CumMaxMinPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
// The cummax/cummin kernels write directly into the output tensors, so no
// extra workspace is required.
return 0;
}
int CumMaxMinPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) {
const void *input = inputs[0];
void *output_value = outputs[0];
int *output_index = (int *)outputs[1];
const int *dims = &(inputDesc[0].dims.d[0]);
int nbDims = inputDesc[0].dims.nbDims;
switch (inputDesc[0].type) {
case nvinfer1::DataType::kFLOAT:
CumMaxMinForwardLauncher_float((float *)input, (float *)output_value,
output_index, dims, nbDims, mDim,
int(mCumType), stream);
break;
case nvinfer1::DataType::kINT32:
CumMaxMinForwardLauncher_int32((int *)input, (int *)output_value,
output_index, dims, nbDims, mDim,
int(mCumType), stream);
break;
default:
break;
}
return 0;
}
nvinfer1::DataType CumMaxMinPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
switch (index) {
case 0:
return inputTypes[0];
case 1:
return nvinfer1::DataType::kINT32;
default:
// index is only ever 0 (values) or 1 (indices); fall back to the input
// type so every control path returns a value.
return inputTypes[0];
}
}
// IPluginV2 Methods
const char *CumMaxMinPluginDynamic::getPluginType() const {
switch (mCumType) {
case TRT_CUMCMPTYPE::TRT_CUMMAX:
return CUMMAX_PLUGIN_NAME;
case TRT_CUMCMPTYPE::TRT_CUMMIN:
return CUMMIN_PLUGIN_NAME;
default:
return "UnknownCumType";
}
}
const char *CumMaxMinPluginDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int CumMaxMinPluginDynamic::getNbOutputs() const { return 2; }
int CumMaxMinPluginDynamic::initialize() { return 0; }
void CumMaxMinPluginDynamic::terminate() {}
size_t CumMaxMinPluginDynamic::getSerializationSize() const {
return sizeof(mDim) + sizeof(mCumType);
}
void CumMaxMinPluginDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mDim);
serialize_value(&buffer, mCumType);
}
void CumMaxMinPluginDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void CumMaxMinPluginDynamic::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *CumMaxMinPluginDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
CumMaxMinPluginDynamicCreator::CumMaxMinPluginDynamicCreator(
TRT_CUMCMPTYPE cumType)
: mCumType(cumType) {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("dim"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *CumMaxMinPluginDynamicCreator::getPluginName() const {
return CUMMAXMIN_PLUGIN_NAME;
}
const char *CumMaxMinPluginDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
CumMaxMinPluginDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *CumMaxMinPluginDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
int dim = 0;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("dim") == 0) {
dim = static_cast<const int *>(fc->fields[i].data)[0];
}
}
CumMaxMinPluginDynamic *plugin =
new CumMaxMinPluginDynamic(name, dim, mCumType);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *CumMaxMinPluginDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
// This object will be deleted when the network is destroyed, which will
// call CumMaxMinPluginDynamic::destroy()
auto plugin = new CumMaxMinPluginDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void CumMaxMinPluginDynamicCreator::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *CumMaxMinPluginDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
CumMaxPluginDynamicCreator::CumMaxPluginDynamicCreator()
: CumMaxMinPluginDynamicCreator(TRT_CUMCMPTYPE::TRT_CUMMAX) {}
const char *CumMaxPluginDynamicCreator::getPluginName() const {
return CUMMAX_PLUGIN_NAME;
}
CumMinPluginDynamicCreator::CumMinPluginDynamicCreator()
: CumMaxMinPluginDynamicCreator(TRT_CUMCMPTYPE::TRT_CUMMIN) {}
const char *CumMinPluginDynamicCreator::getPluginName() const {
return CUMMIN_PLUGIN_NAME;
}
// Copyright (c) OpenMMLab. All rights reserved
#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
using mmcv::TensorDesc;
template <typename scalar_t>
__global__ void cummaxmin_kernel(const scalar_t *input, scalar_t *output_value,
int *output_index, TensorDesc tensor_desc,
int cum_dim, int cum_type) {
const size_t cum_size = tensor_desc.shape[cum_dim];
const size_t cum_stride = tensor_desc.stride[cum_dim];
const size_t data_size =
tensor_desc.stride[0] * tensor_desc.shape[0] / cum_size;
CUDA_1D_KERNEL_LOOP(index, data_size) {
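// `index` enumerates every position of the tensor with the cum dimension
// collapsed; cum_offset is the linear offset of the first element along
// cum_dim at that position.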
size_t cum_offset =
index / cum_stride * (cum_size * cum_stride) + index % cum_stride;
int cum_index = 0;
auto cum_value = input[cum_offset];
output_value[cum_offset] = cum_value;
output_index[cum_offset] = cum_index;
for (size_t cum_index_current = 1; cum_index_current < cum_size;
++cum_index_current) {
cum_offset += cum_stride;
const auto cum_value_current = input[cum_offset];
switch (cum_type) {
case 0: // max
if (cum_value_current > cum_value) {
cum_value = cum_value_current;
cum_index = cum_index_current;
}
break;
case 1: // min
if (cum_value_current < cum_value) {
cum_value = cum_value_current;
cum_index = cum_index_current;
}
break;
}
output_value[cum_offset] = cum_value;
output_index[cum_offset] = cum_index;
}
}
}
template <typename scalar_t>
void CumMaxMinForwardLauncher(const scalar_t *input, scalar_t *output_value,
int *output_index, const int *dims, int nbDims,
int cum_dim, int cum_type, cudaStream_t stream) {
// fill the tensor descriptor: copy the shape and compute contiguous strides
TensorDesc tensor_desc;
memset((void *)&tensor_desc, 0, sizeof(TensorDesc));
tensor_desc.dim = nbDims;
tensor_desc.shape[nbDims - 1] = dims[nbDims - 1];
tensor_desc.stride[nbDims - 1] = 1;
for (int i = nbDims - 2; i >= 0; --i) {
tensor_desc.shape[i] = dims[i];
tensor_desc.stride[i] = dims[i + 1] * tensor_desc.stride[i + 1];
}
// convert a negative cum_dim into its equivalent non-negative axis
cum_dim = cum_dim >= 0 ? cum_dim : (nbDims + cum_dim);
const int data_size =
tensor_desc.stride[0] * tensor_desc.shape[0] / tensor_desc.shape[cum_dim];
const int col_block = GET_BLOCKS(data_size, THREADS_PER_BLOCK);
cummaxmin_kernel<scalar_t><<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output_value, output_index, tensor_desc, cum_dim, cum_type);
}
void CumMaxMinForwardLauncher_float(const float *input, float *output_value,
int *output_index, const int *dims,
int nbDims, int cum_dim, int cum_type,
cudaStream_t stream) {
CumMaxMinForwardLauncher<float>(input, output_value, output_index, dims,
nbDims, cum_dim, cum_type, stream);
}
void CumMaxMinForwardLauncher_int32(const int *input, int *output_value,
int *output_index, const int *dims,
int nbDims, int cum_dim, int cum_type,
cudaStream_t stream) {
CumMaxMinForwardLauncher<int>(input, output_value, output_index, dims, nbDims,
cum_dim, cum_type, stream);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_deform_conv.hpp"
#include <assert.h>
#include <algorithm>
#include <chrono>
#include "trt_serialize.hpp"
void DeformConvForwardCUDAKernelLauncher_float(
const float *input, const float *weight, const float *offset, float *output,
void *workspace, int batchSize, int nInputPlane, int inputHeight,
int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group, int deformable_group,
int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVDeformConv2d"};
} // namespace
nvinfer1::PluginFieldCollection DeformableConvPluginDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField>
DeformableConvPluginDynamicCreator::mPluginAttributes;
DeformableConvPluginDynamic::DeformableConvPluginDynamic(
const std::string &name, const nvinfer1::Dims &stride,
const nvinfer1::Dims &padding, const nvinfer1::Dims &dilation,
const int deformableGroup, const int group, int im2colStep)
: mLayerName(name),
mStride(stride),
mPadding(padding),
mDilation(dilation),
mDeformableGroup(deformableGroup),
mGroup(group),
mIm2colStep(im2colStep) {}
DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string name,
const void *data,
size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mStride);
deserialize_value(&data, &length, &mPadding);
deserialize_value(&data, &length, &mDilation);
deserialize_value(&data, &length, &mDeformableGroup);
deserialize_value(&data, &length, &mGroup);
deserialize_value(&data, &length, &mIm2colStep);
}
DeformableConvPluginDynamic::~DeformableConvPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt *DeformableConvPluginDynamic::clone() const {
DeformableConvPluginDynamic *plugin =
new DeformableConvPluginDynamic(mLayerName, mStride, mPadding, mDilation,
mDeformableGroup, mGroup, mIm2colStep);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs DeformableConvPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
nvinfer1::DimsExprs ret;
ret.nbDims = 4;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[2].d[0];
ret.d[2] = inputs[1].d[2];
ret.d[3] = inputs[1].d[3];
return ret;
}
bool DeformableConvPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
if (pos == 0) {
return (inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == inOut[0].format;
}
}
void DeformableConvPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}
size_t DeformableConvPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
int sizeof_dtype = mmcv::getElementSize(outputs[0].type);
int batch_size = inputs[0].dims.d[0];
int nInputPlane = inputs[0].dims.d[1];
int inputHeight = inputs[0].dims.d[2];
int inputWidth = inputs[0].dims.d[3];
int nOutputPlane = outputs[0].dims.d[1];
int outputHeight = outputs[0].dims.d[2];
int outputWidth = outputs[0].dims.d[3];
int kW = inputs[2].dims.d[2];
int kH = inputs[2].dims.d[3];
int im2col_step = std::min(batch_size, mIm2colStep);
size_t col_size =
mmcv::getAlignedSize(nInputPlane * kW * kH * im2col_step * outputHeight *
outputWidth * sizeof_dtype);
size_t out_size = 0;
if (im2col_step != 1)
out_size = mmcv::getAlignedSize(batch_size * nOutputPlane * outputHeight *
outputWidth * sizeof_dtype);
return col_size + out_size;
}
int DeformableConvPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) {
int batch_size = inputDesc[0].dims.d[0];
int inputChannel = inputDesc[0].dims.d[1];
int inputHeight = inputDesc[0].dims.d[2];
int inputWidth = inputDesc[0].dims.d[3];
int outputChannel = outputDesc[0].dims.d[1];
int kernelHeight = inputDesc[2].dims.d[2];
int kernelWidth = inputDesc[2].dims.d[3];
const void *x = inputs[0];
const void *offset = inputs[1];
const void *weight = inputs[2];
void *output = outputs[0];
int im2col_step = std::min(batch_size, mIm2colStep);
// TODO: add fp16 support
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
DeformConvForwardCUDAKernelLauncher_float(
(float *)x, (float *)weight, (float *)offset, (float *)output,
workSpace, batch_size, inputChannel, inputHeight, inputWidth,
outputChannel, kernelWidth, kernelHeight, mStride.d[0], mStride.d[1],
mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], mGroup,
mDeformableGroup, im2col_step, m_cublas_handle, stream);
break;
default:
return 1;
break;
}
return 0;
}
nvinfer1::DataType DeformableConvPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
return inputTypes[0];
}
// IPluginV2 Methods
const char *DeformableConvPluginDynamic::getPluginType() const {
return PLUGIN_NAME;
}
const char *DeformableConvPluginDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int DeformableConvPluginDynamic::getNbOutputs() const { return 1; }
int DeformableConvPluginDynamic::initialize() { return 0; }
void DeformableConvPluginDynamic::terminate() {}
size_t DeformableConvPluginDynamic::getSerializationSize() const {
return sizeof(mStride) + sizeof(mPadding) + sizeof(mDilation) +
sizeof(mDeformableGroup) + sizeof(mGroup) + sizeof(mIm2colStep);
}
void DeformableConvPluginDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mStride);
serialize_value(&buffer, mPadding);
serialize_value(&buffer, mDilation);
serialize_value(&buffer, mDeformableGroup);
serialize_value(&buffer, mGroup);
serialize_value(&buffer, mIm2colStep);
}
void DeformableConvPluginDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void DeformableConvPluginDynamic::attachToContext(
cudnnContext *cudnnContext, cublasContext *cublasContext,
nvinfer1::IGpuAllocator *gpuAllocator) {
m_cublas_handle = cublasContext;
}
void DeformableConvPluginDynamic::detachFromContext() {}
void DeformableConvPluginDynamic::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *DeformableConvPluginDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
////////////////////// creator /////////////////////////////
DeformableConvPluginDynamicCreator::DeformableConvPluginDynamicCreator() {
mPluginAttributes.emplace_back(nvinfer1::PluginField("stride"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("padding"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("groups"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("bias"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("im2col_step"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *DeformableConvPluginDynamicCreator::getPluginName() const {
return PLUGIN_NAME;
}
const char *DeformableConvPluginDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
DeformableConvPluginDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
nvinfer1::Dims stride{2, {1, 1}};
nvinfer1::Dims padding{2, {0, 0}};
nvinfer1::Dims dilation{2, {1, 1}};
int deformableGroup = 1;
int group = 1;
int im2col_step = 32;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("stride") == 0) {
stride.nbDims = 2;
stride.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
if (fc->fields[i].length == 1) {
stride.d[1] = stride.d[0];
} else {
stride.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
}
if (field_name.compare("padding") == 0) {
padding.nbDims = 2;
padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
if (fc->fields[i].length == 1) {
padding.d[1] = padding.d[0];
} else {
padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
}
if (field_name.compare("dilation") == 0) {
dilation.nbDims = 2;
dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
if (fc->fields[i].length == 1) {
dilation.d[1] = dilation.d[0];
} else {
dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
}
if (field_name.compare("deform_groups") == 0) {
deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("group") == 0) {
group = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("im2col_step") == 0) {
im2col_step = static_cast<const int *>(fc->fields[i].data)[0];
}
}
DeformableConvPluginDynamic *plugin = new DeformableConvPluginDynamic(
name, stride, padding, dilation, deformableGroup, group, im2col_step);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
auto plugin = new DeformableConvPluginDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void DeformableConvPluginDynamicCreator::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *DeformableConvPluginDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
// Copyright (c) OpenMMLab. All rights reserved
#include <cuda_fp16.h>
#include <algorithm>
#include "common_cuda_helper.hpp"
#include "deform_conv_cuda_kernel.cuh"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
template <typename T>
void trt_deformable_im2col(const T* data_input, const T* data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
T* data_col, cudaStream_t stream) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
deformable_im2col_gpu_kernel<T>
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, data_input, data_offset, height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col, data_col);
cudaCheckError();
}
template <typename scalar_t>
void DeformConvForwardCUDAKernelLauncher(
const scalar_t* input, const scalar_t* weight, const scalar_t* offset,
scalar_t* output, void* workspace, int batchSize, int nInputPlane,
int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW,
int dH, int padW, int padH, int dilationW, int dilationH, int group,
int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
cudaStream_t stream) {
size_t word_size = sizeof(scalar_t);
im2col_step = std::min(int(batchSize), im2col_step);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
long long columns_size =
mmcv::getAlignedSize(nInputPlane * kW * kH * im2col_step * outputHeight *
outputWidth * word_size);
// column buffer for im2col
scalar_t* columns = (scalar_t*)workspace;
workspace = static_cast<char*>(workspace) + columns_size;
scalar_t* output_buffer;
long long output_buffer_size = 0;
if (im2col_step == 1) {
output_buffer = output;
} else {
// the output needs a permute step when im2col_step != 1
output_buffer = (scalar_t*)workspace;
output_buffer_size = batchSize * nOutputPlane * outputWidth * outputHeight;
}
long long input_elt_step =
im2col_step * nInputPlane * inputHeight * inputWidth;
long long offset_elt_step =
im2col_step * deformable_group * 2 * kH * kW * outputHeight * outputWidth;
long long out_buffer_step =
nOutputPlane * im2col_step * outputHeight * outputWidth;
long long col_g_step =
nInputPlane * kW * kH / group * im2col_step * outputHeight * outputWidth;
long long weight_g_step =
nOutputPlane / group * nInputPlane / group * kH * kW;
long long out_buffer_g_step =
nOutputPlane / group * im2col_step * outputHeight * outputWidth;
int m = nOutputPlane / group;
int n = im2col_step * outputHeight * outputWidth;
int k = nInputPlane / group * kH * kW;
scalar_t alpha = 1.;
scalar_t beta = 0.;
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
const scalar_t* input_start = input + elt * input_elt_step;
const scalar_t* offset_start = offset + elt * offset_elt_step;
trt_deformable_im2col<scalar_t>(input_start, offset_start, nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW,
dH, dW, dilationH, dilationW, im2col_step,
deformable_group, columns, stream);
for (int g = 0; g < group; ++g) {
const scalar_t* weight_start = weight + g * weight_g_step;
scalar_t* col_start = columns + g * col_g_step;
scalar_t* out_buffer_start =
output_buffer + elt * out_buffer_step + g * out_buffer_g_step;
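// cuBLAS assumes column-major storage, so this computes
//   out^T (n x m) = columns^T (n x k) * weight^T (k x m),
// which is exactly the row-major product
//   out (m x n) = weight (m x k) * columns (k x n)
// without any explicit transpose.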
cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
&alpha, col_start, n, weight_start, k, &beta,
out_buffer_start, n);
cudaCheckError();
}
}
if (im2col_step != 1) {
int output_buffer_shape[5] = {batchSize / im2col_step, nOutputPlane,
im2col_step, static_cast<int>(outputHeight),
static_cast<int>(outputWidth)};
int output_buffer_permute[5] = {0, 2, 1, 3, 4};
memcpyPermute<scalar_t>(output, output_buffer, &output_buffer_shape[0],
&output_buffer_permute[0], 5, stream);
}
}
void DeformConvForwardCUDAKernelLauncher_float(
const float* input, const float* weight, const float* offset, float* output,
void* workspace, int batchSize, int nInputPlane, int inputHeight,
int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group, int deformable_group,
int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream) {
DeformConvForwardCUDAKernelLauncher<float>(
input, weight, offset, output, workspace, batchSize, nInputPlane,
inputHeight, inputWidth, nOutputPlane, kW, kH, dW, dH, padW, padH,
dilationW, dilationH, group, deformable_group, im2col_step, cublas_handle,
stream);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_grid_sampler.hpp"
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include "trt_serialize.hpp"
using mmcv::GridSamplerInterpolation;
using mmcv::GridSamplerPadding;
void grid_sample_float(float *output, const float *input, const float *grid,
int *output_dims, int *input_dims, int *grid_dims,
int nb_dims, GridSamplerInterpolation interp,
GridSamplerPadding padding, bool align_corners,
cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"grid_sampler"};
} // namespace
nvinfer1::PluginFieldCollection GridSamplerDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField> GridSamplerDynamicCreator::mPluginAttributes;
GridSamplerDynamic::GridSamplerDynamic(const std::string &name, int mode,
int paddingMode, bool alignCorners)
: mLayerName(name),
mMode(mode),
mPaddingMode(paddingMode),
mAlignCorners(alignCorners) {}
GridSamplerDynamic::GridSamplerDynamic(const std::string name, const void *data,
size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mMode);
deserialize_value(&data, &length, &mPaddingMode);
deserialize_value(&data, &length, &mAlignCorners);
}
nvinfer1::IPluginV2DynamicExt *GridSamplerDynamic::clone() const {
GridSamplerDynamic *plugin =
new GridSamplerDynamic(mLayerName, mMode, mPaddingMode, mAlignCorners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs GridSamplerDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
nvinfer1::DimsExprs ret;
ret.nbDims = inputs[0].nbDims;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[0].d[1];
for (int i = 2; i < ret.nbDims; ++i) {
ret.d[i] = inputs[1].d[i - 1];
}
return ret;
}
bool GridSamplerDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
if (pos == 0) {
return (inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == inOut[0].format;
}
}
void GridSamplerDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {
// Validate input arguments
}
size_t GridSamplerDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
return 0;
}
int GridSamplerDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs,
void *workSpace, cudaStream_t stream) {
nvinfer1::Dims input_dims = inputDesc[0].dims;
nvinfer1::Dims grid_dims = inputDesc[1].dims;
nvinfer1::Dims output_dims = outputDesc[0].dims;
using mmcv::GridSamplerInterpolation;
using mmcv::GridSamplerPadding;
GridSamplerInterpolation interp_mode = GridSamplerInterpolation::Bilinear;
switch (mMode) {
case 0:
interp_mode = GridSamplerInterpolation::Bilinear;
break;
case 1:
interp_mode = GridSamplerInterpolation::Nearest;
break;
default:
break;
}
GridSamplerPadding padding_mode = GridSamplerPadding::Zeros;
switch (mPaddingMode) {
case 0:
padding_mode = GridSamplerPadding::Zeros;
break;
case 1:
padding_mode = GridSamplerPadding::Border;
break;
case 2:
padding_mode = GridSamplerPadding::Reflection;
break;
default:
break;
}
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
grid_sample_float(
(float *)outputs[0], (float *)inputs[0], (float *)inputs[1],
&(output_dims.d[0]), &(input_dims.d[0]), &(grid_dims.d[0]),
input_dims.nbDims, interp_mode, padding_mode, mAlignCorners, stream);
break;
default:
return 1;
break;
}
return 0;
}
nvinfer1::DataType GridSamplerDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
return inputTypes[0];
}
// IPluginV2 Methods
const char *GridSamplerDynamic::getPluginType() const { return PLUGIN_NAME; }
const char *GridSamplerDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int GridSamplerDynamic::getNbOutputs() const { return 1; }
int GridSamplerDynamic::initialize() { return 0; }
void GridSamplerDynamic::terminate() {}
size_t GridSamplerDynamic::getSerializationSize() const {
return sizeof(mMode) + sizeof(mPaddingMode) + sizeof(mAlignCorners);
}
void GridSamplerDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mMode);
serialize_value(&buffer, mPaddingMode);
serialize_value(&buffer, mAlignCorners);
}
void GridSamplerDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void GridSamplerDynamic::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *GridSamplerDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
////////////////////// creator /////////////////////////////
GridSamplerDynamicCreator::GridSamplerDynamicCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("interpolation_mode"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("padding_mode"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("align_corners"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *GridSamplerDynamicCreator::getPluginName() const {
return PLUGIN_NAME;
}
const char *GridSamplerDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
GridSamplerDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *GridSamplerDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
int mode = 0;
int paddingMode = 0;
bool alignCorners = false;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("interpolation_mode") == 0) {
mode = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("padding_mode") == 0) {
paddingMode = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("align_corners") == 0) {
alignCorners = (bool)(static_cast<const int *>(fc->fields[i].data)[0]);
}
}
GridSamplerDynamic *plugin =
new GridSamplerDynamic(name, mode, paddingMode, alignCorners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *GridSamplerDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
// This object will be deleted when the network is destroyed, which will
// call GridSamplerDynamic::destroy()
auto plugin = new GridSamplerDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void GridSamplerDynamicCreator::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *GridSamplerDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cuh
// and
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cu
#include <cuda_fp16.h>
#include <stdio.h>
#include <algorithm>
#include <climits>
#include <cmath>
#include <vector>
#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_grid_sampler.hpp"
#include "trt_plugin_helper.hpp"
using mmcv::GridSamplerInterpolation;
using mmcv::GridSamplerPadding;
using mmcv::TensorDesc;
// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value,
// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5).
// if align_corners: -1 and +1 get sent to the centers of the corner pixels
// -1 --> 0
// +1 --> (size - 1)
// scale_factor = (size - 1) / 2
// if not align_corners: -1 and +1 get sent to the image edges
// -1 --> -0.5
// +1 --> (size - 1) + 0.5 == size - 0.5
// scale_factor = size / 2
template <typename scalar_t>
static __forceinline__ __device__ scalar_t
grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) {
if (align_corners) {
// unnormalize coord from [-1, 1] to [0, size - 1]
return ((coord + 1.f) / 2) * (size - 1);
} else {
// unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
return ((coord + 1.f) * size - 1) / 2;
}
}
// Clips coordinates to between 0 and clip_limit - 1
template <typename scalar_t>
static __forceinline__ __device__ scalar_t clip_coordinates(scalar_t in,
int clip_limit) {
return ::min(static_cast<scalar_t>(clip_limit - 1),
::max(in, static_cast<scalar_t>(0)));
}
// Reflects coordinates until they fall between low and high (inclusive).
// The bounds are passed as twice their value so that half-integer values
// can be represented as ints.
template <typename scalar_t>
static __forceinline__ __device__ scalar_t reflect_coordinates(scalar_t in,
int twice_low,
int twice_high) {
if (twice_low == twice_high) {
return static_cast<scalar_t>(0);
}
scalar_t min = static_cast<scalar_t>(twice_low) / 2;
scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
in = ::fabs(in - min);
// `fmod` returns same sign as `in`, which is positive after the `fabs` above.
scalar_t extra = ::fmod(in, span);
int flips = static_cast<int>(::floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename scalar_t>
static __forceinline__ __device__ scalar_t
safe_downgrade_to_int_range(scalar_t x) {
// -100.0 does not have special meaning. This is just to make sure
// it's not within_bounds_2d or within_bounds_3d, and does not cause
// undefined behavior. See #35506.
if (x > INT_MAX - 1 || x < INT_MIN || !::isfinite(static_cast<double>(x)))
return static_cast<scalar_t>(-100.0);
return x;
}
// Computes the pixel source index value for a grid coordinate
template <typename scalar_t>
static __forceinline__ __device__ scalar_t grid_sampler_compute_source_index(
scalar_t coord, int size, GridSamplerPadding padding_mode,
bool align_corners) {
coord = grid_sampler_unnormalize(coord, size, align_corners);
if (padding_mode == GridSamplerPadding::Border) {
// clip coordinates to image borders
coord = clip_coordinates(coord, size);
} else if (padding_mode == GridSamplerPadding::Reflection) {
// reflect coordinates by image borders
if (align_corners) {
coord = reflect_coordinates(coord, 0, 2 * (size - 1));
} else {
coord = reflect_coordinates(coord, -1, 2 * size - 1);
}
// clip coordinates to image borders
coord = clip_coordinates(coord, size);
}
coord = safe_downgrade_to_int_range(coord);
return coord;
}
static __forceinline__ __device__ bool within_bounds_2d(int h, int w, int H,
int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
static __forceinline__ __device__ bool within_bounds_3d(int d, int h, int w,
int D, int H, int W) {
return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
}
template <typename scalar_t>
__global__ void grid_sampler_2d_kernel(
const int nthreads, const scalar_t *input, const scalar_t *grid,
scalar_t *output, TensorDesc input_desc, TensorDesc grid_desc,
TensorDesc output_desc, const GridSamplerInterpolation interpolation_mode,
const GridSamplerPadding padding_mode, bool align_corners) {
int C = input_desc.shape[1];
int inp_H = input_desc.shape[2];
int inp_W = input_desc.shape[3];
int out_H = grid_desc.shape[1];
int out_W = grid_desc.shape[2];
int inp_sN = input_desc.stride[0];
int inp_sC = input_desc.stride[1];
int inp_sH = input_desc.stride[2];
int inp_sW = input_desc.stride[3];
int grid_sN = grid_desc.stride[0];
int grid_sH = grid_desc.stride[1];
int grid_sW = grid_desc.stride[2];
int grid_sCoor = grid_desc.stride[3];
int out_sN = output_desc.stride[0];
int out_sC = output_desc.stride[1];
int out_sH = output_desc.stride[2];
int out_sW = output_desc.stride[3];
CUDA_1D_KERNEL_LOOP(index, nthreads) {
const int w = index % out_W;
const int h = (index / out_W) % out_H;
const int n = index / (out_H * out_W);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
// get the corresponding input x, y coordinates from grid
scalar_t ix = grid[grid_offset];
scalar_t iy = grid[grid_offset + grid_sCoor];
ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode,
align_corners);
iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode,
align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get NE, NW, SE, SW pixel values from (x, y)
int ix_nw = static_cast<int>(::floor(ix));
int iy_nw = static_cast<int>(::floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
// get surfaces to each neighbor:
scalar_t nw = (ix_se - ix) * (iy_se - iy);
scalar_t ne = (ix - ix_sw) * (iy_sw - iy);
scalar_t sw = (ix_ne - ix) * (iy - iy_ne);
scalar_t se = (ix - ix_nw) * (iy - iy_nw);
// calculate bilinear weighted pixel value and set output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < C;
++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
*out_ptr_NCHW = static_cast<scalar_t>(0);
if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int ix_nearest = static_cast<int>(::round(ix));
int iy_nearest = static_cast<int>(::round(iy));
// assign nearest neighbor pixel value to output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < C;
++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
*out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<scalar_t>(0);
}
}
}
}
}
template <typename scalar_t>
__global__ void grid_sampler_3d_kernel(
const int nthreads, const scalar_t *input, const scalar_t *grid,
scalar_t *output, TensorDesc input_desc, TensorDesc grid_desc,
TensorDesc output_desc, const GridSamplerInterpolation interpolation_mode,
const GridSamplerPadding padding_mode, bool align_corners) {
int C = input_desc.shape[1];
int inp_D = input_desc.shape[2];
int inp_H = input_desc.shape[3];
int inp_W = input_desc.shape[4];
int out_D = grid_desc.shape[1];
int out_H = grid_desc.shape[2];
int out_W = grid_desc.shape[3];
int inp_sN = input_desc.stride[0];
int inp_sC = input_desc.stride[1];
int inp_sD = input_desc.stride[2];
int inp_sH = input_desc.stride[3];
int inp_sW = input_desc.stride[4];
int grid_sN = grid_desc.stride[0];
int grid_sD = grid_desc.stride[1];
int grid_sH = grid_desc.stride[2];
int grid_sW = grid_desc.stride[3];
int grid_sCoor = grid_desc.stride[4];
int out_sN = output_desc.stride[0];
int out_sC = output_desc.stride[1];
int out_sD = output_desc.stride[2];
int out_sH = output_desc.stride[3];
int out_sW = output_desc.stride[4];
CUDA_1D_KERNEL_LOOP(index, nthreads) {
const int w = index % out_W;
const int h = (index / out_W) % out_H;
const int d = (index / (out_H * out_W)) % out_D;
const int n = index / (out_D * out_H * out_W);
const int grid_offset =
n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z coordinates from grid
scalar_t ix = grid[grid_offset];
scalar_t iy = grid[grid_offset + grid_sCoor];
scalar_t iz = grid[grid_offset + 2 * grid_sCoor];
ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode,
align_corners);
iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode,
align_corners);
iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode,
align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get corner pixel values from (x, y, z)
// for 4d, we used north-east-south-west
// for 5d, we add top-bottom
int ix_tnw = static_cast<int>(::floor(ix));
int iy_tnw = static_cast<int>(::floor(iy));
int iz_tnw = static_cast<int>(::floor(iz));
int ix_tne = ix_tnw + 1;
int iy_tne = iy_tnw;
int iz_tne = iz_tnw;
int ix_tsw = ix_tnw;
int iy_tsw = iy_tnw + 1;
int iz_tsw = iz_tnw;
int ix_tse = ix_tnw + 1;
int iy_tse = iy_tnw + 1;
int iz_tse = iz_tnw;
int ix_bnw = ix_tnw;
int iy_bnw = iy_tnw;
int iz_bnw = iz_tnw + 1;
int ix_bne = ix_tnw + 1;
int iy_bne = iy_tnw;
int iz_bne = iz_tnw + 1;
int ix_bsw = ix_tnw;
int iy_bsw = iy_tnw + 1;
int iz_bsw = iz_tnw + 1;
int ix_bse = ix_tnw + 1;
int iy_bse = iy_tnw + 1;
int iz_bse = iz_tnw + 1;
// get surfaces to each neighbor:
scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCDHW =
output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
for (int c = 0; c < C;
++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
// (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) *
// tne
// + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) *
// tse
// + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) *
// bne
// + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) *
// bse
*out_ptr_NCDHW = static_cast<scalar_t>(0);
if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] *
tnw;
}
if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] *
tne;
}
if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] *
tsw;
}
if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] *
tse;
}
if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] *
bnw;
}
if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] *
bne;
}
if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] *
bsw;
}
if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW +=
inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] *
bse;
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int ix_nearest = static_cast<int>(::round(ix));
int iy_nearest = static_cast<int>(::round(iy));
int iz_nearest = static_cast<int>(::round(iz));
// assign nearest neighbor pixel value to output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCDHW =
output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
for (int c = 0; c < C;
++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H,
inp_W)) {
*out_ptr_NCDHW =
inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH +
ix_nearest * inp_sW];
} else {
*out_ptr_NCDHW = static_cast<scalar_t>(0);
}
}
}
}
}
void create_desc(const int *dims, int nb_dims, TensorDesc &desc) {
memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims);
desc.stride[nb_dims - 1] = 1;
for (int i = nb_dims - 2; i >= 0; --i) {
desc.stride[i] = desc.stride[i + 1] * desc.shape[i + 1];
}
}
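// For example, dims = {2, 3, 4, 5} (NCHW) yields strides = {60, 20, 5, 1}.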
template <typename T>
void grid_sample(T *output, const T *input, const T *grid, int *output_dims,
int *input_dims, int *grid_dims, int nb_dims,
GridSamplerInterpolation interp, GridSamplerPadding padding,
bool align_corners, cudaStream_t stream) {
TensorDesc input_desc;
create_desc(input_dims, nb_dims, input_desc);
TensorDesc output_desc;
create_desc(output_dims, nb_dims, output_desc);
TensorDesc grid_desc;
create_desc(grid_dims, nb_dims, grid_desc);
int count = 1;
for (int i = 0; i < nb_dims; ++i) {
if (i == 1) {
continue;
}
count *= output_desc.shape[i];
}
if (nb_dims == 4) {
grid_sampler_2d_kernel<T>
<<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
count, input, grid, output, input_desc, grid_desc, output_desc,
interp, padding, align_corners);
} else if (nb_dims == 5) {
grid_sampler_3d_kernel<T>
<<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
count, input, grid, output, input_desc, grid_desc, output_desc,
interp, padding, align_corners);
} else {
printf("input and grid dims should be 4 or 5\n");
}
}
void grid_sample_float(float *output, const float *input, const float *grid,
int *output_dims, int *input_dims, int *grid_dims,
int nb_dims, GridSamplerInterpolation interp,
GridSamplerPadding padding, bool align_corners,
cudaStream_t stream) {
grid_sample<float>(output, input, grid, output_dims, input_dims, grid_dims,
nb_dims, interp, padding, align_corners, stream);
}
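// Usage sketch (illustrative only; the shapes below are hypothetical):
// for a 4-D sampling with input [1, 3, 8, 8] and grid [1, 4, 4, 2], the
// output is [1, 3, 4, 4] and the call looks like
//   int input_dims[4] = {1, 3, 8, 8};
//   int grid_dims[4] = {1, 4, 4, 2};
//   int output_dims[4] = {1, 3, 4, 4};
//   grid_sample_float(output, input, grid, output_dims, input_dims, grid_dims,
//                     4, GridSamplerInterpolation::Bilinear,
//                     GridSamplerPadding::Border, /*align_corners=*/false,
//                     stream);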
// Copyright (c) OpenMMLab. All rights reserved
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp
#include "trt_instance_norm.hpp"
#include <cuda_fp16.h>
#include <stdexcept>
#include "trt_serialize.hpp"
using namespace nvinfer1;
cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype,
cudnnDataType_t* cudnn_dtype) {
switch (trt_dtype) {
case nvinfer1::DataType::kFLOAT:
*cudnn_dtype = CUDNN_DATA_FLOAT;
break;
case nvinfer1::DataType::kHALF:
*cudnn_dtype = CUDNN_DATA_HALF;
break;
default:
return CUDNN_STATUS_BAD_PARAM;
}
return CUDNN_STATUS_SUCCESS;
}
namespace {
constexpr const char* PLUGIN_VERSION{"1"};
constexpr const char* PLUGIN_NAME{"MMCVInstanceNormalization"};
} // namespace
PluginFieldCollection InstanceNormalizationDynamicCreator::mFC{};
std::vector<PluginField> InstanceNormalizationDynamicCreator::mPluginAttributes;
InstanceNormalizationDynamic::InstanceNormalizationDynamic(
const std::string& name, float epsilon)
: mLayerName(name), mEpsilon(epsilon) {}
InstanceNormalizationDynamic::InstanceNormalizationDynamic(
const std::string& name, void const* serialData, size_t serialLength)
: mLayerName(name) {
deserialize_value(&serialData, &serialLength, &mEpsilon);
}
InstanceNormalizationDynamic::~InstanceNormalizationDynamic() {}
// InstanceNormalizationDynamic returns one output.
int InstanceNormalizationDynamic::getNbOutputs() const { return 1; }
DimsExprs InstanceNormalizationDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) {
nvinfer1::DimsExprs output(inputs[0]);
return output;
}
int InstanceNormalizationDynamic::initialize() { return 0; }
void InstanceNormalizationDynamic::terminate() {}
size_t InstanceNormalizationDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const {
int n = inputs[0].dims.d[0];
int c = inputs[0].dims.d[1];
int elem_size = mmcv::getElementSize(inputs[1].type);
return mmcv::getAlignedSize(n * c * elem_size) * 2;
}
int InstanceNormalizationDynamic::enqueue(
const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
void* const* outputs, void* workspace, cudaStream_t stream) {
nvinfer1::Dims input_dims = inputDesc[0].dims;
int n = input_dims.d[0];
int c = input_dims.d[1];
int h = input_dims.d[2];
int w = input_dims.nbDims > 3 ? input_dims.d[3] : 1;
int elem_size = mmcv::getElementSize(inputDesc[1].type);
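  // Instance normalization is implemented as batch normalization over a
  // tensor reshaped to [1, N*C, H, W]; the per-channel scale and bias are
  // therefore tiled N times into the workspace below.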
  char* n_scales = static_cast<char*>(workspace);
  char* n_bias =
      static_cast<char*>(workspace) + mmcv::getAlignedSize(n * c * elem_size);
  const void* scales = (const void*)inputs[1];
  const void* bias = (const void*)inputs[2];
  for (int i = 0; i < n; ++i) {
    cudaMemcpyAsync(n_scales + i * c * elem_size, scales, c * elem_size,
                    cudaMemcpyDeviceToDevice, stream);
    cudaMemcpyAsync(n_bias + i * c * elem_size, bias, c * elem_size,
                    cudaMemcpyDeviceToDevice, stream);
  }
cudnnSetTensor4dDescriptor(_b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1,
n * c, 1, 1);
cudnnDataType_t cudnn_dtype{};
convert_trt2cudnn_dtype(inputDesc[0].type, &cudnn_dtype);
cudnnSetTensor4dDescriptor(_x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c,
h, w);
cudnnSetTensor4dDescriptor(_y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c,
h, w);
float alpha = 1;
float beta = 0;
void const* x_ptr = inputs[0];
void* y_ptr = outputs[0];
cudnnSetStream(_cudnn_handle, stream);
// Note: Use of CUDNN_BATCHNORM_SPATIAL_PERSISTENT can cause numerical
// overflows (NaNs) for fp32 data in some circumstances. The lower-
// performance CUDNN_BATCHNORM_SPATIAL should be used if this is not
// acceptable.
cudnnBatchNormalizationForwardTraining(
_cudnn_handle, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, &alpha, &beta, _x_desc,
x_ptr, _y_desc, y_ptr, _b_desc, n_scales, n_bias, 1., nullptr, nullptr,
mEpsilon, nullptr, nullptr);
return 0;
}
size_t InstanceNormalizationDynamic::getSerializationSize() const {
return serialized_size(mEpsilon);
}
void InstanceNormalizationDynamic::serialize(void* buffer) const {
serialize_value(&buffer, mEpsilon);
}
bool InstanceNormalizationDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
int nbOutputs) {
return ((inOut[pos].type == nvinfer1::DataType::kFLOAT ||
inOut[pos].type == nvinfer1::DataType::kHALF) &&
inOut[pos].format == nvinfer1::PluginFormat::kLINEAR &&
inOut[pos].type == inOut[0].type);
}
const char* InstanceNormalizationDynamic::getPluginType() const {
return PLUGIN_NAME;
}
const char* InstanceNormalizationDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
void InstanceNormalizationDynamic::destroy() { delete this; }
IPluginV2DynamicExt* InstanceNormalizationDynamic::clone() const {
auto* plugin = new InstanceNormalizationDynamic{mLayerName, mEpsilon};
plugin->setPluginNamespace(mPluginNamespace.c_str());
return plugin;
}
// Set plugin namespace
void InstanceNormalizationDynamic::setPluginNamespace(
const char* pluginNamespace) {
mPluginNamespace = pluginNamespace;
}
const char* InstanceNormalizationDynamic::getPluginNamespace() const {
return mPluginNamespace.c_str();
}
nvinfer1::DataType InstanceNormalizationDynamic::getOutputDataType(
int index, const nvinfer1::DataType* inputTypes, int nbInputs) const {
return inputTypes[0];
}
// Attach the plugin object to an execution context and grant the plugin the
// access to some context resource.
void InstanceNormalizationDynamic::attachToContext(
cudnnContext* cudnnContext, cublasContext* cublasContext,
IGpuAllocator* gpuAllocator) {
_cudnn_handle = cudnnContext;
cudnnCreateTensorDescriptor(&_b_desc);
cudnnCreateTensorDescriptor(&_x_desc);
cudnnCreateTensorDescriptor(&_y_desc);
}
// Detach the plugin object from its execution context.
void InstanceNormalizationDynamic::detachFromContext() {
cudnnDestroyTensorDescriptor(_y_desc);
cudnnDestroyTensorDescriptor(_x_desc);
cudnnDestroyTensorDescriptor(_b_desc);
}
void InstanceNormalizationDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {}
// InstanceNormalizationDynamicCreator methods
InstanceNormalizationDynamicCreator::InstanceNormalizationDynamicCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(
PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char* InstanceNormalizationDynamicCreator::getPluginName() const {
return PLUGIN_NAME;
}
const char* InstanceNormalizationDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const PluginFieldCollection*
InstanceNormalizationDynamicCreator::getFieldNames() {
return &mFC;
}
IPluginV2DynamicExt* InstanceNormalizationDynamicCreator::createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) {
float epsilon = 1e-5;
const PluginField* fields = fc->fields;
for (int i = 0; i < fc->nbFields; ++i) {
const char* attrName = fields[i].name;
if (!strcmp(attrName, "epsilon")) {
epsilon = *(static_cast<const float*>(fields[i].data));
}
}
InstanceNormalizationDynamic* obj =
new InstanceNormalizationDynamic(name, epsilon);
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
IPluginV2DynamicExt* InstanceNormalizationDynamicCreator::deserializePlugin(
const char* name, const void* serialData, size_t serialLength) {
InstanceNormalizationDynamic* obj =
new InstanceNormalizationDynamic{name, serialData, serialLength};
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
void InstanceNormalizationDynamicCreator::setPluginNamespace(
const char* libNamespace) {
mNamespace = libNamespace;
}
const char* InstanceNormalizationDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_modulated_deform_conv.hpp"
#include <assert.h>
#include <chrono>
#include "trt_serialize.hpp"
void ModulatedDeformConvForwardCUDAKernelLauncher_float(
const float *input, const float *weight, const float *bias,
const float *offset, const float *mask, float *output, void *workspace,
int batch, int channels, int height, int width, int channels_out,
int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group,
int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVModulatedDeformConv2d"};
} // namespace
nvinfer1::PluginFieldCollection
ModulatedDeformableConvPluginDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField>
ModulatedDeformableConvPluginDynamicCreator::mPluginAttributes;
ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(
const std::string &name, const nvinfer1::Dims stride,
const nvinfer1::Dims padding, const nvinfer1::Dims dilation,
const int deformableGroup, const int group)
: mLayerName(name),
mStride(stride),
mPadding(padding),
mDilation(dilation),
mDeformableGroup(deformableGroup),
mGroup(group) {
mWithBias = false;
}
ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(
const std::string name, const void *data, size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mStride);
deserialize_value(&data, &length, &mPadding);
deserialize_value(&data, &length, &mDilation);
deserialize_value(&data, &length, &mDeformableGroup);
deserialize_value(&data, &length, &mGroup);
mWithBias = false;
}
ModulatedDeformableConvPluginDynamic::~ModulatedDeformableConvPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt *ModulatedDeformableConvPluginDynamic::clone()
const {
ModulatedDeformableConvPluginDynamic *plugin =
new ModulatedDeformableConvPluginDynamic(
mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs ModulatedDeformableConvPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
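  // Output shape is [N, C_out, H_out, W_out]: N from the input (inputs[0]),
  // C_out from the weight (inputs[3]), and the spatial size from the offset
  // tensor (inputs[1]).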
nvinfer1::DimsExprs ret;
ret.nbDims = 4;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[3].d[0];
ret.d[2] = inputs[1].d[2];
ret.d[3] = inputs[1].d[3];
return ret;
}
bool ModulatedDeformableConvPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
if (pos == 0) {
return (inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == inOut[0].format;
}
}
void ModulatedDeformableConvPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {
if (nbInputs == 5) {
mWithBias = true;
}
}
size_t ModulatedDeformableConvPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
int sizeof_dtype = mmcv::getElementSize(outputs[0].type);
int batch_size = inputs[0].dims.d[0];
int nInputPlane = inputs[0].dims.d[1];
int inputHeight = inputs[0].dims.d[2];
int inputWidth = inputs[0].dims.d[3];
int nOutputPlane = outputs[0].dims.d[1];
int outputHeight = outputs[0].dims.d[2];
int outputWidth = outputs[0].dims.d[3];
int kW = inputs[3].dims.d[2];
int kH = inputs[3].dims.d[3];
int im2col_step = std::min(32, batch_size);
size_t col_size = mmcv::getAlignedSize(nInputPlane * kW * kH * outputHeight *
outputWidth * sizeof_dtype);
return col_size;
}
int ModulatedDeformableConvPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) {
int batch = inputDesc[0].dims.d[0];
int channels = inputDesc[0].dims.d[1];
int height = inputDesc[0].dims.d[2];
int width = inputDesc[0].dims.d[3];
int channels_out = outputDesc[0].dims.d[1];
int kernel_h = inputDesc[3].dims.d[2];
int kernel_w = inputDesc[3].dims.d[3];
const void *x = inputs[0];
const void *offset = inputs[1];
const void *mask = inputs[2];
const void *weight = inputs[3];
const void *bias = mWithBias ? inputs[4] : nullptr;
void *output = outputs[0];
int im2col_step = std::min(batch, 32);
// TODO: add fp16 support
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
ModulatedDeformConvForwardCUDAKernelLauncher_float(
(float *)x, (float *)weight, (float *)bias, (float *)offset,
(float *)mask, (float *)output, workSpace, batch, channels, height,
width, channels_out, kernel_w, kernel_h, mStride.d[0], mStride.d[1],
mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], mGroup,
mDeformableGroup, im2col_step, m_cublas_handle, stream);
break;
default:
return 1;
break;
}
return 0;
}
nvinfer1::DataType ModulatedDeformableConvPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
return inputTypes[0];
}
// IPluginV2 Methods
const char *ModulatedDeformableConvPluginDynamic::getPluginType() const {
return PLUGIN_NAME;
}
const char *ModulatedDeformableConvPluginDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int ModulatedDeformableConvPluginDynamic::getNbOutputs() const { return 1; }
int ModulatedDeformableConvPluginDynamic::initialize() { return 0; }
void ModulatedDeformableConvPluginDynamic::terminate() {}
size_t ModulatedDeformableConvPluginDynamic::getSerializationSize() const {
return sizeof(mStride) + sizeof(mPadding) + sizeof(mDilation) +
sizeof(mDeformableGroup) + sizeof(mGroup);
}
void ModulatedDeformableConvPluginDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mStride);
serialize_value(&buffer, mPadding);
serialize_value(&buffer, mDilation);
serialize_value(&buffer, mDeformableGroup);
serialize_value(&buffer, mGroup);
}
void ModulatedDeformableConvPluginDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void ModulatedDeformableConvPluginDynamic::attachToContext(
cudnnContext *cudnnContext, cublasContext *cublasContext,
nvinfer1::IGpuAllocator *gpuAllocator) {
m_cublas_handle = cublasContext;
}
void ModulatedDeformableConvPluginDynamic::detachFromContext() {}
void ModulatedDeformableConvPluginDynamic::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *ModulatedDeformableConvPluginDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
////////////////////// creator /////////////////////////////
ModulatedDeformableConvPluginDynamicCreator::
ModulatedDeformableConvPluginDynamicCreator() {
mPluginAttributes.emplace_back(nvinfer1::PluginField("stride"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("padding"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("groups"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginName() const {
return PLUGIN_NAME;
}
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginVersion()
const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
ModulatedDeformableConvPluginDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
nvinfer1::Dims stride{2, {1, 1}};
nvinfer1::Dims padding{2, {0, 0}};
nvinfer1::Dims dilation{2, {1, 1}};
int deformableGroup = 1;
int group = 1;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("deform_groups") == 0) {
deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("group") == 0) {
group = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("stride") == 0) {
stride.nbDims = 2;
stride.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
stride.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
if (field_name.compare("padding") == 0) {
padding.nbDims = 2;
padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
if (field_name.compare("dilation") == 0) {
dilation.nbDims = 2;
dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
}
ModulatedDeformableConvPluginDynamic *plugin =
new ModulatedDeformableConvPluginDynamic(name, stride, padding, dilation,
deformableGroup, group);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *
ModulatedDeformableConvPluginDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
auto plugin =
new ModulatedDeformableConvPluginDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void ModulatedDeformableConvPluginDynamicCreator::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginNamespace()
const {
return mNamespace.c_str();
}
// Copyright (c) OpenMMLab. All rights reserved
#include <assert.h>
#include <cuda_fp16.h>
#include "common_cuda_helper.hpp"
#include "modulated_deform_conv_cuda_kernel.cuh"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
template <typename T>
void trt_modulated_deformable_im2col(
const T* data_im_, const T* data_offset_, const T* data_mask_,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, T* data_col_,
cudaStream_t stream) {
// num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
modulated_deformable_im2col_gpu_kernel<T>
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im,
          kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, channel_per_deformable_group, batch_size, channels,
deformable_group, height_col, width_col, data_col_);
cudaCheckError();
}
template <typename scalar_t>
__global__ void output_add_bias_kernel(scalar_t* output, const scalar_t* bias,
size_t step_batch, size_t step_channel,
size_t n) {
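  // Output is laid out as [batch, channel, height, width]; step_batch = C*H*W
  // and step_channel = H*W, so (index % step_batch) / step_channel recovers
  // the channel index used to look up the bias.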
CUDA_1D_KERNEL_LOOP(index, n) {
output[index] += bias[(index % step_batch) / step_channel];
}
}
template <typename scalar_t>
static void output_add_bias(scalar_t* output, const scalar_t* bias,
size_t batch, size_t channel, size_t height,
size_t width, cudaStream_t stream) {
size_t step_channel = height * width;
size_t step_batch = step_channel * channel;
size_t n = step_batch * batch;
output_add_bias_kernel<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(
output, bias, step_batch, step_channel, n);
}
template <typename scalar_t>
void ModulatedDeformConvForwardCUDAKernelLauncher(
const scalar_t* input, const scalar_t* weight, const scalar_t* bias,
const scalar_t* offset, const scalar_t* mask, scalar_t* output,
void* workspace, int batch, int channels, int height, int width,
int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h,
int pad_w, int pad_h, int dilation_w, int dilation_h, int group,
int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
cudaStream_t stream) {
size_t sizeof_dtype = sizeof(scalar_t);
bool with_bias = (bias != nullptr);
im2col_step = std::min(int(batch), im2col_step);
assert(batch % im2col_step == 0);
const int channels_kernel = channels / group;
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
scalar_t* columns = (scalar_t*)workspace;
const size_t input_step = channels * height * width;
const size_t offset_step =
deformable_group * kernel_h * kernel_w * 2 * height * width;
const size_t mask_step =
deformable_group * kernel_h * kernel_w * height * width;
const size_t out_step = channels_out * height_out * width_out;
const size_t out_group_step = out_step / group;
const size_t col_g_step =
channels * kernel_w * kernel_h / group * height_out * width_out;
const size_t weight_g_step =
channels_out / group * channels / group * kernel_h * kernel_w;
const int m = channels_out / group;
const int n = height_out * width_out;
const int k = channels / group * kernel_h * kernel_w;
scalar_t alpha = 1.;
scalar_t beta = 0.;
for (int b = 0; b < batch; b++) {
const scalar_t* input_start = input + b * input_step;
const scalar_t* offset_start = offset + b * offset_step;
const scalar_t* mask_start = mask + b * mask_step;
trt_modulated_deformable_im2col<scalar_t>(
input_start, offset_start, mask_start, 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, columns, stream);
for (int g = 0; g < group; g++) {
const scalar_t* weight_start = weight + g * weight_g_step;
scalar_t* col_start = columns + g * col_g_step;
scalar_t* out_buffer_start = output + b * out_step + g * out_group_step;
// cudaMemsetAsync(out_buffer_start, 0, 1, stream);
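      // GEMM computes out_buffer (m x n, row-major) = weight (m x k) *
      // columns (k x n). cuBLAS works in column-major order, so the operands
      // are passed swapped (assuming cublasGemmWrap follows the standard
      // cublas<t>gemm argument order).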
cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
&alpha, col_start, n, weight_start, k, &beta,
out_buffer_start, n);
cudaCheckError();
}
}
if (with_bias) {
output_add_bias<scalar_t>(output, bias, batch, channels_out, height_out,
width_out, stream);
}
}
void ModulatedDeformConvForwardCUDAKernelLauncher_float(
const float* input, const float* weight, const float* bias,
const float* offset, const float* mask, float* output, void* workspace,
int batch, int channels, int height, int width, int channels_out,
int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group,
int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream) {
ModulatedDeformConvForwardCUDAKernelLauncher<float>(
input, weight, bias, offset, mask, output, workspace, batch, channels,
height, width, channels_out, kernel_w, kernel_h, stride_w, stride_h,
pad_w, pad_h, dilation_w, dilation_h, group, deformable_group,
im2col_step, cublas_handle, stream);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_nms.hpp"
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include "trt_serialize.hpp"
extern size_t get_onnxnms_workspace_size(
size_t num_batches, size_t spatial_dimension, size_t num_classes,
size_t boxes_word_size, int center_point_box, size_t output_length);
extern void TRTNMSCUDAKernelLauncher_float(
const float *boxes, const float *scores,
const int max_output_boxes_per_class, const float iou_threshold,
const float score_threshold, const int offset, int *output,
int center_point_box, int num_batches, int spatial_dimension,
int num_classes, size_t output_length, void *workspace,
cudaStream_t stream);
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"NonMaxSuppression"};
} // namespace
nvinfer1::PluginFieldCollection NonMaxSuppressionDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField>
NonMaxSuppressionDynamicCreator::mPluginAttributes;
NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(
const std::string &name, int centerPointBox, int maxOutputBoxesPerClass,
float iouThreshold, float scoreThreshold, int offset)
: mLayerName(name),
mCenterPointBox(centerPointBox),
mMaxOutputBoxesPerClass(maxOutputBoxesPerClass),
mIouThreshold(iouThreshold),
mScoreThreshold(scoreThreshold),
mOffset(offset) {}
NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(const std::string name,
const void *data,
size_t length)
: mLayerName(name) {
deserialize_value(&data, &length, &mCenterPointBox);
deserialize_value(&data, &length, &mMaxOutputBoxesPerClass);
deserialize_value(&data, &length, &mIouThreshold);
deserialize_value(&data, &length, &mScoreThreshold);
deserialize_value(&data, &length, &mOffset);
}
nvinfer1::IPluginV2DynamicExt *NonMaxSuppressionDynamic::clone() const {
NonMaxSuppressionDynamic *plugin = new NonMaxSuppressionDynamic(
mLayerName, mCenterPointBox, mMaxOutputBoxesPerClass, mIouThreshold,
mScoreThreshold, mOffset);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs NonMaxSuppressionDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) {
nvinfer1::DimsExprs ret;
ret.nbDims = 2;
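  // selected_indices has shape
  // [num_batches * num_classes * min(spatial_dimension,
  //  max_output_boxes_per_class), 3].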
auto num_batches = inputs[0].d[0];
auto spatial_dimension = inputs[0].d[1];
if (mMaxOutputBoxesPerClass > 0) {
spatial_dimension = exprBuilder.operation(
nvinfer1::DimensionOperation::kMIN, *spatial_dimension,
*exprBuilder.constant(mMaxOutputBoxesPerClass));
}
auto num_classes = inputs[1].d[1];
ret.d[0] = exprBuilder.operation(
nvinfer1::DimensionOperation::kPROD, *num_batches,
*exprBuilder.operation(nvinfer1::DimensionOperation::kPROD,
*spatial_dimension, *num_classes));
ret.d[1] = exprBuilder.constant(3);
return ret;
}
bool NonMaxSuppressionDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
int nbOutputs) {
if (pos < nbInputs) {
switch (pos) {
case 0:
// boxes
return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
case 1:
// scores
return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
default:
return true;
}
} else {
switch (pos - nbInputs) {
case 0:
// selected_indices
return inOut[pos].type == nvinfer1::DataType::kINT32 &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
default:
return true;
}
}
return true;
}
void NonMaxSuppressionDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}
size_t NonMaxSuppressionDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
size_t boxes_word_size = mmcv::getElementSize(inputs[0].type);
size_t num_batches = inputs[0].dims.d[0];
size_t spatial_dimension = inputs[0].dims.d[1];
size_t num_classes = inputs[1].dims.d[1];
size_t output_length = outputs[0].dims.d[0];
return get_onnxnms_workspace_size(num_batches, spatial_dimension, num_classes,
boxes_word_size, mCenterPointBox,
output_length);
}
int NonMaxSuppressionDynamic::enqueue(
const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) {
int num_batches = inputDesc[0].dims.d[0];
int spatial_dimension = inputDesc[0].dims.d[1];
int num_classes = inputDesc[1].dims.d[1];
int output_length = outputDesc[0].dims.d[0];
const float *boxes = (const float *)inputs[0];
const float *scores = (const float *)inputs[1];
int *output = (int *)outputs[0];
TRTNMSCUDAKernelLauncher_float(
boxes, scores, mMaxOutputBoxesPerClass, mIouThreshold, mScoreThreshold,
mOffset, output, mCenterPointBox, num_batches, spatial_dimension,
num_classes, output_length, workSpace, stream);
return 0;
}
nvinfer1::DataType NonMaxSuppressionDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
return nvinfer1::DataType::kINT32;
}
// IPluginV2 Methods
const char *NonMaxSuppressionDynamic::getPluginType() const {
return PLUGIN_NAME;
}
const char *NonMaxSuppressionDynamic::getPluginVersion() const {
return PLUGIN_VERSION;
}
int NonMaxSuppressionDynamic::getNbOutputs() const { return 1; }
int NonMaxSuppressionDynamic::initialize() { return 0; }
void NonMaxSuppressionDynamic::terminate() {}
size_t NonMaxSuppressionDynamic::getSerializationSize() const {
return sizeof(mCenterPointBox) + sizeof(mMaxOutputBoxesPerClass) +
sizeof(mIouThreshold) + sizeof(mScoreThreshold) + sizeof(mOffset);
}
void NonMaxSuppressionDynamic::serialize(void *buffer) const {
serialize_value(&buffer, mCenterPointBox);
serialize_value(&buffer, mMaxOutputBoxesPerClass);
serialize_value(&buffer, mIouThreshold);
serialize_value(&buffer, mScoreThreshold);
serialize_value(&buffer, mOffset);
}
void NonMaxSuppressionDynamic::destroy() {
// This gets called when the network containing plugin is destroyed
delete this;
}
void NonMaxSuppressionDynamic::setPluginNamespace(const char *libNamespace) {
mNamespace = libNamespace;
}
const char *NonMaxSuppressionDynamic::getPluginNamespace() const {
return mNamespace.c_str();
}
////////////////////// creator /////////////////////////////
NonMaxSuppressionDynamicCreator::NonMaxSuppressionDynamicCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("center_point_box"));
mPluginAttributes.emplace_back(
nvinfer1::PluginField("max_output_boxes_per_class"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("iou_threshold"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("score_threshold"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("offset"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *NonMaxSuppressionDynamicCreator::getPluginName() const {
return PLUGIN_NAME;
}
const char *NonMaxSuppressionDynamicCreator::getPluginVersion() const {
return PLUGIN_VERSION;
}
const nvinfer1::PluginFieldCollection *
NonMaxSuppressionDynamicCreator::getFieldNames() {
return &mFC;
}
nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) {
int centerPointBox = 0;
int maxOutputBoxesPerClass = 0;
float iouThreshold = 0.0f;
float scoreThreshold = 0.0f;
int offset = 0;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("center_point_box") == 0) {
centerPointBox = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("max_output_boxes_per_class") == 0) {
maxOutputBoxesPerClass = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("iou_threshold") == 0) {
iouThreshold = static_cast<const float *>(fc->fields[i].data)[0];
}
if (field_name.compare("score_threshold") == 0) {
scoreThreshold = static_cast<const float *>(fc->fields[i].data)[0];
}
if (field_name.compare("offset") == 0) {
offset = static_cast<const int *>(fc->fields[i].data)[0];
}
}
NonMaxSuppressionDynamic *plugin =
new NonMaxSuppressionDynamic(name, centerPointBox, maxOutputBoxesPerClass,
iouThreshold, scoreThreshold, offset);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) {
auto plugin = new NonMaxSuppressionDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
void NonMaxSuppressionDynamicCreator::setPluginNamespace(
const char *libNamespace) {
mNamespace = libNamespace;
}
const char *NonMaxSuppressionDynamicCreator::getPluginNamespace() const {
return mNamespace.c_str();
}
// Copyright (c) OpenMMLab. All rights reserved
#include <stdio.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <chrono>
#include <thread>
#include <vector>
#include "common_cuda_helper.hpp"
#include "nms_cuda_kernel.cuh"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
struct NMSBox {
float box[4];
};
struct nms_centerwh2xyxy {
__host__ __device__ NMSBox operator()(const NMSBox box) {
NMSBox out;
out.box[0] = box.box[0] - box.box[2] / 2.0f;
out.box[1] = box.box[1] - box.box[3] / 2.0f;
out.box[2] = box.box[0] + box.box[2] / 2.0f;
out.box[3] = box.box[1] + box.box[3] / 2.0f;
return out;
}
};
struct nms_sbox_idle {
const float* idle_box_;
__host__ __device__ nms_sbox_idle(const float* idle_box) {
idle_box_ = idle_box;
}
__host__ __device__ NMSBox operator()(const NMSBox box) {
return {idle_box_[0], idle_box_[1], idle_box_[2], idle_box_[3]};
}
};
struct nms_score_threshold {
float score_threshold_;
__host__ __device__ nms_score_threshold(const float score_threshold) {
score_threshold_ = score_threshold;
}
__host__ __device__ bool operator()(const float score) {
return score < score_threshold_;
}
};
__global__ void nms_reindex_kernel(int n, int* output, int* index_cache) {
CUDA_1D_KERNEL_LOOP(index, n) {
const int old_index = output[index * 3 + 2];
output[index * 3 + 2] = index_cache[old_index];
}
}
__global__ void mask_to_output_kernel(const unsigned long long* dev_mask,
const int* index, int* output,
int* output_count, int batch_id,
int cls_id, int spatial_dimension,
int col_blocks,
int max_output_boxes_per_class) {
extern __shared__ unsigned long long remv[];
// fill remv with 0
CUDA_1D_KERNEL_LOOP(i, col_blocks) { remv[i] = 0; }
__syncthreads();
int start = *output_count;
int out_per_class_count = 0;
for (int i = 0; i < spatial_dimension; i++) {
const int nblock = i / threadsPerBlock;
const int inblock = i % threadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
if (threadIdx.x == 0) {
output[start * 3 + 0] = batch_id;
output[start * 3 + 1] = cls_id;
output[start * 3 + 2] = index[i];
start += 1;
}
out_per_class_count += 1;
if (out_per_class_count >= max_output_boxes_per_class) {
break;
}
__syncthreads();
// set every overlap box with bit 1 in remv
const unsigned long long* p = dev_mask + i * col_blocks;
CUDA_1D_KERNEL_LOOP(j, col_blocks) {
if (j >= nblock) {
remv[j] |= p[j];
}
} // j
__syncthreads();
}
} // i
if (threadIdx.x == 0) {
*output_count = start;
}
}
size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension,
size_t num_classes, size_t boxes_word_size,
int center_point_box, size_t output_length) {
size_t boxes_xyxy_workspace = 0;
if (center_point_box == 1) {
boxes_xyxy_workspace = mmcv::getAlignedSize(
num_batches * spatial_dimension * 4 * boxes_word_size);
}
size_t scores_workspace =
mmcv::getAlignedSize(spatial_dimension * boxes_word_size);
size_t boxes_workspace =
mmcv::getAlignedSize(spatial_dimension * 4 * boxes_word_size);
const int col_blocks =
(spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
size_t mask_workspace = mmcv::getAlignedSize(spatial_dimension * col_blocks *
sizeof(unsigned long long));
size_t index_template_workspace =
mmcv::getAlignedSize(spatial_dimension * sizeof(int));
size_t index_workspace =
mmcv::getAlignedSize(spatial_dimension * sizeof(int));
size_t count_workspace = mmcv::getAlignedSize(sizeof(int));
return scores_workspace + boxes_xyxy_workspace + boxes_workspace +
mask_workspace + index_template_workspace + index_workspace +
count_workspace;
}
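// The launcher below carves the workspace up in this order: boxes_sorted,
// boxes_xyxy (only when center_point_box == 1), scores_sorted, dev_mask,
// index_cache, index_template and output_count, each aligned with
// mmcv::getAlignedSize.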
/**
* Launch the NonMaxSuppression kernel
*
 * NMS is performed independently for each batch/class pair, sharing the
 * kernel implementation `nms_cuda`. For each batch/class, `boxes_sorted` and
 * `index_cache` are sorted by score, and `boxes_sorted` is fed to the
 * `nms_cuda` kernel. The output is then generated by `mask_to_output_kernel`
 * from `dev_mask` and `index_cache`.
*
 * @param[in] boxes input boxes with shape [num_batches, spatial_dimension, 4]
 * @param[in] scores input scores with shape
 * [num_batches, num_classes, spatial_dimension]
 * @param[in] max_output_boxes_per_class maximum number of output boxes per
 * class
 * @param[in] iou_threshold IoU threshold
 * @param[in] score_threshold score threshold
 * @param[in] offset box offset, only 0 or 1 is valid
 * @param[out] output output with shape [output_length, 3]; each row contains
 * (batch_id, class_id, box_id), and unused rows are filled with -1
 * @param[in] center_point_box 0 if boxes are [left, top, right, bottom],
 * 1 if boxes are [center_x, center_y, width, height]
 * @param[in] num_batches batch size of boxes and scores
 * @param[in] spatial_dimension number of boxes per batch
 * @param[in] num_classes number of classes
 * @param[in] output_length maximum number of output rows
 * @param[in] workspace memory for all temporary variables
 * @param[in] stream cuda stream
*/
void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores,
const int max_output_boxes_per_class,
const float iou_threshold,
const float score_threshold,
const int offset, int* output,
int center_point_box, int num_batches,
int spatial_dimension, int num_classes,
size_t output_length, void* workspace,
cudaStream_t stream) {
const int col_blocks =
(spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
float* boxes_sorted = (float*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * 4 * sizeof(float));
float* boxes_xyxy = nullptr;
if (center_point_box == 1) {
boxes_xyxy = (float*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(num_batches * spatial_dimension * 4 *
sizeof(float));
thrust::transform(thrust::cuda::par.on(stream), (NMSBox*)boxes,
(NMSBox*)(boxes + num_batches * spatial_dimension * 4),
(NMSBox*)boxes_xyxy, nms_centerwh2xyxy());
cudaCheckError();
}
float* scores_sorted = (float*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * sizeof(float));
unsigned long long* dev_mask = (unsigned long long*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * col_blocks *
sizeof(unsigned long long));
int* index_cache = (int*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * sizeof(int));
// generate sequence [0,1,2,3,4 ....]
int* index_template = (int*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * sizeof(int));
thrust::sequence(thrust::cuda::par.on(stream), index_template,
index_template + spatial_dimension, 0);
int max_output_boxes_per_class_cpu = max_output_boxes_per_class;
if (max_output_boxes_per_class_cpu <= 0) {
max_output_boxes_per_class_cpu = spatial_dimension;
}
int* output_count = (int*)workspace;
workspace = static_cast<char*>(workspace) + mmcv::getAlignedSize(sizeof(int));
cudaMemsetAsync(output_count, 0, sizeof(int), stream);
// fill output with -1
thrust::fill(thrust::cuda::par.on(stream), output, output + output_length * 3,
-1);
cudaCheckError();
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
for (int cls_id = 0; cls_id < num_classes; ++cls_id) {
const int batch_cls_id = batch_id * num_classes + cls_id;
// sort boxes by score
cudaMemcpyAsync(scores_sorted, scores + batch_cls_id * spatial_dimension,
spatial_dimension * sizeof(float),
cudaMemcpyDeviceToDevice, stream);
cudaCheckError();
cudaMemcpyAsync(index_cache, index_template,
spatial_dimension * sizeof(int), cudaMemcpyDeviceToDevice,
stream);
cudaCheckError();
thrust::sort_by_key(thrust::cuda::par.on(stream), scores_sorted,
scores_sorted + spatial_dimension, index_cache,
thrust::greater<float>());
if (center_point_box == 1) {
thrust::gather(thrust::cuda::par.on(stream), index_cache,
index_cache + spatial_dimension,
(NMSBox*)(boxes_xyxy + batch_id * spatial_dimension * 4),
(NMSBox*)boxes_sorted);
} else {
thrust::gather(thrust::cuda::par.on(stream), index_cache,
index_cache + spatial_dimension,
(NMSBox*)(boxes + batch_id * spatial_dimension * 4),
(NMSBox*)boxes_sorted);
}
cudaCheckError();
if (score_threshold > 0.0f) {
thrust::transform_if(
thrust::cuda::par.on(stream), (NMSBox*)boxes_sorted,
(NMSBox*)(boxes_sorted + spatial_dimension * 4), scores_sorted,
(NMSBox*)boxes_sorted, nms_sbox_idle(boxes_sorted),
nms_score_threshold(score_threshold));
}
nms_cuda<<<blocks, threads, 0, stream>>>(spatial_dimension, iou_threshold,
offset, boxes_sorted, dev_mask);
      // convert the IoU mask into output indices once nms_cuda has filled dev_mask.
mask_to_output_kernel<<<1, threadsPerBlock,
col_blocks * sizeof(unsigned long long),
stream>>>(
dev_mask, index_cache, output, output_count, batch_id, cls_id,
spatial_dimension, col_blocks, max_output_boxes_per_class_cpu);
} // cls_id
} // batch_id
}