Commit 546b4279 authored by limm

add csrc and mmdeploy module

parent 502f4fb9
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GATHER_TOPK_HPP
#define TRT_GATHER_TOPK_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class GatherTopk : public TRTPluginBase {
public:
GatherTopk(const std::string &name);
GatherTopk(const std::string name, const void *data, size_t length);
GatherTopk() = delete;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
};
class GatherTopkCreator : public TRTPluginCreatorBase {
public:
GatherTopkCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif  // TRT_GATHER_TOPK_HPP
// Copyright (c) OpenMMLab. All rights reserved.
#include <functional>
#include <numeric>
#include <vector>
#include "common_cuda_helper.hpp"
#include "gather_topk_kernel.hpp"
#include "trt_plugin_helper.hpp"
template <typename scalar_t>
__global__ void gather_topk_kernel(const scalar_t* input, const int* indices, scalar_t* output,
int batch, int num_input, int num_indices, int channel) {
CUDA_1D_KERNEL_LOOP(index, batch * num_indices * channel) {
const int b_id = index / (num_indices * channel);
const int n_id = (index / channel) % num_indices;
const int c_id = index % channel;
const int input_n_id = indices[b_id * num_indices + n_id];
const scalar_t value = input[b_id * num_input * channel + input_n_id * channel + c_id];
output[b_id * num_indices * channel + n_id * channel + c_id] = value;
}
}
template <typename scalar_t>
void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims,
const int* indices_dims, int indice_nbDims, scalar_t* output,
cudaStream_t stream) {
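// Editor's note: `dims` is the input shape and `indices_dims` the indices
// shape. The leading indice_nbDims - 1 axes form the batch, axis
// indice_nbDims - 1 is the gathered axis (num_input -> num_indices), and any
// trailing input axes are flattened into `channel`.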
int batch = 1;
for (int i = 0; i < indice_nbDims - 1; ++i) batch *= dims[i];
int num_input = dims[indice_nbDims - 1];
int num_indices = indices_dims[indice_nbDims - 1];
int channel = 1;
for (int i = indice_nbDims; i < nbDims; ++i) channel *= dims[i];
const int col_block = DIVUP(batch * num_indices * channel, THREADS_PER_BLOCK);
gather_topk_kernel<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(input, indices, output, batch,
num_input, num_indices, channel);
}
template void gather_topk_impl<float>(const float* input, const int* indices, const int* dims,
int nbDims, const int* indices_dims, int indice_nbDims,
float* output, cudaStream_t stream);
template void gather_topk_impl<int32_t>(const int32_t* input, const int* indices, const int* dims,
int nbDims, const int* indices_dims, int indice_nbDims,
int32_t* output, cudaStream_t stream);
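// Editor's note: a minimal host-side usage sketch of gather_topk_impl, not
// part of the original commit; buffer names and sizes are illustrative and the
// device buffers are assumed to be allocated and filled elsewhere. Gathering
// input of shape [B, N, C] with indices of shape [B, K] yields output of
// shape [B, K, C].
static void gather_topk_example(const float* d_input, const int* d_indices,
                                float* d_output, cudaStream_t stream) {
  const int dims[3] = {2, 100, 4};      // input shape   [B = 2, N = 100, C = 4]
  const int indices_dims[2] = {2, 10};  // indices shape [B = 2, K = 10]
  gather_topk_impl<float>(d_input, d_indices, dims, /*nbDims=*/3, indices_dims,
                          /*indice_nbDims=*/2, d_output, stream);
}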
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GATHER_TOPK_KERNEL_HPP
#define TRT_GATHER_TOPK_KERNEL_HPP
#include <cuda_runtime.h>
template <typename scalar_t>
void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims,
const int* indices_dims, int indice_nbDims, scalar_t* output,
cudaStream_t stream);
#endif  // TRT_GATHER_TOPK_KERNEL_HPP
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_grid_priors.hpp"
#include <assert.h>
#include <chrono>
#include "trt_grid_priors_kernel.hpp"
#include "trt_serialize.hpp"
using namespace nvinfer1;
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"GridPriorsTRT"};
} // namespace
GridPriorsTRT::GridPriorsTRT(const std::string &name, const nvinfer1::Dims stride)
: TRTPluginBase(name), mStride(stride) {}
GridPriorsTRT::GridPriorsTRT(const std::string name, const void *data, size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &mStride);
}
GridPriorsTRT::~GridPriorsTRT() {}
nvinfer1::IPluginV2DynamicExt *GridPriorsTRT::clone() const TRT_NOEXCEPT {
GridPriorsTRT *plugin = new GridPriorsTRT(mLayerName, mStride);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs GridPriorsTRT::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
// input[0] == base_anchor
// input[1] == empty_h
// input[2] == empty_w
nvinfer1::DimsExprs ret;
ret.nbDims = 2;
auto area =
exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs[2].d[0], *inputs[1].d[0]);
ret.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *area, *(inputs[0].d[0]));
ret.d[1] = exprBuilder.constant(4);
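// e.g. feat_h = 2, feat_w = 3, num_base_anchors = 3 -> output shape (18, 4)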
return ret;
}
bool GridPriorsTRT::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 0) {
return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else if (pos - nbInputs == 0) {
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
} else {
return true;
}
}
int GridPriorsTRT::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
int num_base_anchors = inputDesc[0].dims.d[0];
int feat_h = inputDesc[1].dims.d[0];
int feat_w = inputDesc[2].dims.d[0];
const void *base_anchor = inputs[0];
void *output = outputs[0];
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
trt_grid_priors_impl<float>((float *)base_anchor, (float *)output, num_base_anchors, feat_w,
feat_h, mStride.d[0], mStride.d[1], stream);
break;
default:
return 1;
}
return 0;
}
nvinfer1::DataType GridPriorsTRT::getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *GridPriorsTRT::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *GridPriorsTRT::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
int GridPriorsTRT::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t GridPriorsTRT::getSerializationSize() const TRT_NOEXCEPT { return serialized_size(mStride); }
void GridPriorsTRT::serialize(void *buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mStride);
}
////////////////////// creator /////////////////////////////
GridPriorsTRTCreator::GridPriorsTRTCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_h"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_w"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *GridPriorsTRTCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *GridPriorsTRTCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
nvinfer1::IPluginV2 *GridPriorsTRTCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
int stride_w = 1;
int stride_h = 1;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("stride_w") == 0) {
stride_w = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("stride_h") == 0) {
stride_h = static_cast<const int *>(fc->fields[i].data)[0];
}
}
nvinfer1::Dims stride{2, {stride_w, stride_h}};
GridPriorsTRT *plugin = new GridPriorsTRT(name, stride);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *GridPriorsTRTCreator::deserializePlugin(const char *name,
const void *serialData,
size_t serialLength) TRT_NOEXCEPT {
auto plugin = new GridPriorsTRT(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(GridPriorsTRTCreator);
} // namespace mmdeploy
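// Editor's note: a minimal sketch of building this plugin through its creator,
// not part of the original commit. The field names must match those parsed in
// createPlugin above; the stride values are illustrative only.
static nvinfer1::IPluginV2 *grid_priors_plugin_example() {
  int stride_w = 16, stride_h = 16;
  const nvinfer1::PluginField fields[] = {
      {"stride_w", &stride_w, nvinfer1::PluginFieldType::kINT32, 1},
      {"stride_h", &stride_h, nvinfer1::PluginFieldType::kINT32, 1}};
  const nvinfer1::PluginFieldCollection fc{2, fields};
  mmdeploy::GridPriorsTRTCreator creator;
  return creator.createPlugin("grid_priors", &fc);
}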
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GRID_PRIORS_HPP
#define TRT_GRID_PRIORS_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class GridPriorsTRT : public TRTPluginBase {
public:
GridPriorsTRT(const std::string &name, const nvinfer1::Dims stride);
GridPriorsTRT(const std::string name, const void *data, size_t length);
GridPriorsTRT() = delete;
~GridPriorsTRT() TRT_NOEXCEPT override;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
private:
nvinfer1::Dims mStride;
cublasHandle_t m_cublas_handle;
};
class GridPriorsTRTCreator : public TRTPluginCreatorBase {
public:
GridPriorsTRTCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_GRID_PRIORS_HPP
// Copyright (c) OpenMMLab. All rights reserved
#include <cuda_fp16.h>
#include "common_cuda_helper.hpp"
#include "trt_grid_priors_kernel.hpp"
#include "trt_plugin_helper.hpp"
template <typename scalar_t>
__global__ void trt_grid_priors_kernel(const scalar_t* base_anchor, scalar_t* output,
int num_base_anchors, int feat_w, int feat_h, int stride_w,
int stride_h) {
// load base anchor into shared memory.
extern __shared__ scalar_t shared_base_anchor[];
for (int i = threadIdx.x; i < num_base_anchors * 4; i += blockDim.x) {
shared_base_anchor[i] = base_anchor[i];
}
__syncthreads();
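// Editor's note: `index` enumerates (h, w, anchor) with the anchor axis
// fastest: index = (h * feat_w + w) * num_base_anchors + a, so each thread
// shifts one base anchor by (w * stride_w, h * stride_h).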
CUDA_1D_KERNEL_LOOP(index, num_base_anchors * feat_w * feat_h) {
const int a_offset = (index % num_base_anchors) << 2;
const scalar_t w = scalar_t(((index / num_base_anchors) % feat_w) * stride_w);
const scalar_t h = scalar_t((index / (feat_w * num_base_anchors)) * stride_h);
auto out_start = output + index * 4;
out_start[0] = shared_base_anchor[a_offset] + w;
out_start[1] = shared_base_anchor[a_offset + 1] + h;
out_start[2] = shared_base_anchor[a_offset + 2] + w;
out_start[3] = shared_base_anchor[a_offset + 3] + h;
}
}
template <typename scalar_t>
void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors,
int feat_w, int feat_h, int stride_w, int stride_h, cudaStream_t stream) {
trt_grid_priors_kernel<<<GET_BLOCKS(num_base_anchors * feat_w * feat_h), THREADS_PER_BLOCK,
DIVUP(num_base_anchors * 4, 32) * 32 * sizeof(scalar_t), stream>>>(
base_anchor, output, (int)num_base_anchors, (int)feat_w, (int)feat_h, (int)stride_w,
(int)stride_h);
}
template void trt_grid_priors_impl<float>(const float* base_anchor, float* output,
int num_base_anchors, int feat_w, int feat_h,
int stride_w, int stride_h, cudaStream_t stream);
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TRT_GRID_PRIORS_KERNEL_HPP
#define TRT_GRID_PRIORS_KERNEL_HPP
#include <cuda_runtime.h>
template <typename scalar_t>
void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors,
int feat_w, int feat_h, int stride_w, int stride_h, cudaStream_t stream);
#endif
// Copyright (c) OpenMMLab. All rights reserved.
#include "trt_grid_sampler.hpp"
#include <assert.h>
#include <chrono>
#include "trt_grid_sampler_kernel.hpp"
#include "trt_plugin_helper.hpp"
#include "trt_serialize.hpp"
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"grid_sampler"};
} // namespace
TRTGridSampler::TRTGridSampler(const std::string &name, int mode, int paddingMode,
bool alignCorners)
: TRTPluginBase(name), mMode(mode), mPaddingMode(paddingMode), mAlignCorners(alignCorners) {}
TRTGridSampler::TRTGridSampler(const std::string name, const void *data, size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &mMode);
deserialize_value(&data, &length, &mPaddingMode);
deserialize_value(&data, &length, &mAlignCorners);
}
nvinfer1::IPluginV2DynamicExt *TRTGridSampler::clone() const TRT_NOEXCEPT {
TRTGridSampler *plugin = new TRTGridSampler(mLayerName, mMode, mPaddingMode, mAlignCorners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs TRTGridSampler::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
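// input[0] is the input tensor (N, C, H_in, W_in) or (N, C, D_in, H_in, W_in);
// input[1] is the sampling grid (N, H_out, W_out, 2) or
// (N, D_out, H_out, W_out, 3). The output keeps N and C from the input and
// takes its spatial dims from the grid.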
nvinfer1::DimsExprs ret;
ret.nbDims = inputs[0].nbDims;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[0].d[1];
for (int i = 2; i < ret.nbDims; ++i) {
ret.d[i] = inputs[1].d[i - 1];
}
return ret;
}
bool TRTGridSampler::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 0) {
return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
}
}
void TRTGridSampler::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs,
int nbOutputs) TRT_NOEXCEPT {
// Validate input arguments
}
size_t TRTGridSampler::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT {
return 0;
}
int TRTGridSampler::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
nvinfer1::Dims input_dims = inputDesc[0].dims;
nvinfer1::Dims grid_dims = inputDesc[1].dims;
nvinfer1::Dims output_dims = outputDesc[0].dims;
GridSamplerInterpolation interp_mode = GridSamplerInterpolation::Bilinear;
switch (mMode) {
case 0:
interp_mode = GridSamplerInterpolation::Bilinear;
break;
case 1:
interp_mode = GridSamplerInterpolation::Nearest;
break;
default:
break;
}
GridSamplerPadding padding_mode = GridSamplerPadding::Zeros;
switch (mPaddingMode) {
case 0:
padding_mode = GridSamplerPadding::Zeros;
break;
case 1:
padding_mode = GridSamplerPadding::Border;
break;
case 2:
padding_mode = GridSamplerPadding::Reflection;
break;
default:
break;
}
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
grid_sample<float>((float *)outputs[0], (float *)inputs[0], (float *)inputs[1],
&(output_dims.d[0]), &(input_dims.d[0]), &(grid_dims.d[0]),
input_dims.nbDims, interp_mode, padding_mode, mAlignCorners, stream);
break;
default:
return 1;
break;
}
return 0;
}
nvinfer1::DataType TRTGridSampler::getOutputDataType(int index,
const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *TRTGridSampler::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTGridSampler::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
int TRTGridSampler::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t TRTGridSampler::getSerializationSize() const TRT_NOEXCEPT {
return serialized_size(mMode) + serialized_size(mPaddingMode) + serialized_size(mAlignCorners);
}
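// The write order below must match the read order in the deserializing
// constructor above: mMode, mPaddingMode, mAlignCorners.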
void TRTGridSampler::serialize(void *buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mMode);
serialize_value(&buffer, mPaddingMode);
serialize_value(&buffer, mAlignCorners);
}
////////////////////// creator /////////////////////////////
TRTGridSamplerCreator::TRTGridSamplerCreator() {
mPluginAttributes = std::vector<nvinfer1::PluginField>(
{nvinfer1::PluginField("interpolation_mode"), nvinfer1::PluginField("padding_mode"),
nvinfer1::PluginField("align_corners")});
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *TRTGridSamplerCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTGridSamplerCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
nvinfer1::IPluginV2 *TRTGridSamplerCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
int mode = 0;
int paddingMode = 0;
bool alignCorners = false;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("interpolation_mode") == 0) {
mode = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("padding_mode") == 0) {
paddingMode = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("align_corners") == 0) {
alignCorners = (bool)(static_cast<const int *>(fc->fields[i].data)[0]);
}
}
TRTGridSampler *plugin = new TRTGridSampler(name, mode, paddingMode, alignCorners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *TRTGridSamplerCreator::deserializePlugin(const char *name,
const void *serialData,
size_t serialLength) TRT_NOEXCEPT {
// This object will be deleted when the network is destroyed, which will
// call TRTGridSampler::destroy()
auto plugin = new TRTGridSampler(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(TRTGridSamplerCreator);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GRID_SAMPLER_HPP
#define TRT_GRID_SAMPLER_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class TRTGridSampler : public TRTPluginBase {
public:
TRTGridSampler(const std::string &name, int mode, int paddingMode, bool alignCorners);
TRTGridSampler(const std::string name, const void *data, size_t length);
TRTGridSampler() = delete;
~TRTGridSampler() TRT_NOEXCEPT override = default;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
private:
int mMode;
int mPaddingMode;
bool mAlignCorners;
};
class TRTGridSamplerCreator : public TRTPluginCreatorBase {
public:
TRTGridSamplerCreator();
~TRTGridSamplerCreator() TRT_NOEXCEPT override = default;
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_GRID_SAMPLER_HPP
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cuh
// and
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cu
#include <cuda_fp16.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <vector>
#include "common_cuda_helper.hpp"
#include "trt_grid_sampler_kernel.hpp"
#include "trt_plugin_helper.hpp"
using mmdeploy::TensorDesc;
// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value,
// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5).
// if align_corners: -1 and +1 get sent to the centers of the corner pixels
// -1 --> 0
// +1 --> (size - 1)
// scale_factor = (size - 1) / 2
// if not align_corners: -1 and +1 get sent to the image edges
// -1 --> -0.5
// +1 --> (size - 1) + 0.5 == size - 0.5
// scale_factor = size / 2
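// Worked example (editor's note): with size = 4 and align_corners = true,
// coord -1 -> 0, 0 -> 1.5, +1 -> 3; with align_corners = false,
// coord -1 -> -0.5, 0 -> 1.5, +1 -> 3.5.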
template <typename scalar_t>
static __forceinline__ __device__ scalar_t grid_sampler_unnormalize(scalar_t coord, int size,
bool align_corners) {
if (align_corners) {
// unnormalize coord from [-1, 1] to [0, size - 1]
return ((coord + 1.f) / 2) * (size - 1);
} else {
// unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
return ((coord + 1.f) * size - 1) / 2;
}
}
// Clips coordinates to between 0 and clip_limit - 1
template <typename scalar_t>
static __forceinline__ __device__ scalar_t clip_coordinates(scalar_t in, int clip_limit) {
return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0)));
}
// Reflects coordinates until they fall between low and high (inclusive).
// The bounds are passed as twice their value so that half-integer values
// can be represented as ints.
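// Example (editor's note): for size = 5 with align_corners, twice_low = 0 and
// twice_high = 2 * (size - 1) = 8, so span = 4; an out-of-range coordinate 5.5
// gives extra = 1.5 with one flip, reflecting to 4 - 1.5 = 2.5.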
template <typename scalar_t>
static __forceinline__ __device__ scalar_t reflect_coordinates(scalar_t in, int twice_low,
int twice_high) {
if (twice_low == twice_high) {
return static_cast<scalar_t>(0);
}
scalar_t min = static_cast<scalar_t>(twice_low) / 2;
scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
in = ::fabs(in - min);
// `fmod` returns same sign as `in`, which is positive after the `fabs` above.
scalar_t extra = ::fmod(in, span);
int flips = static_cast<int>(::floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename scalar_t>
static __forceinline__ __device__ scalar_t safe_downgrade_to_int_range(scalar_t x) {
// -100.0 does not have special meaning. This is just to make sure
// it's not within_bounds_2d or within_bounds_3d, and does not cause
// undefined behavior. See #35506.
if (x > INT_MAX - 1 || x < INT_MIN || !::isfinite(static_cast<double>(x)))
return static_cast<scalar_t>(-100.0);
return x;
}
// Computes the pixel source index value for a grid coordinate
template <typename scalar_t>
static __forceinline__ __device__ scalar_t grid_sampler_compute_source_index(
scalar_t coord, int size, GridSamplerPadding padding_mode, bool align_corners) {
coord = grid_sampler_unnormalize(coord, size, align_corners);
if (padding_mode == GridSamplerPadding::Border) {
// clip coordinates to image borders
coord = clip_coordinates(coord, size);
} else if (padding_mode == GridSamplerPadding::Reflection) {
// reflect coordinates by image borders
if (align_corners) {
coord = reflect_coordinates(coord, 0, 2 * (size - 1));
} else {
coord = reflect_coordinates(coord, -1, 2 * size - 1);
}
// clip coordinates to image borders
coord = clip_coordinates(coord, size);
}
coord = safe_downgrade_to_int_range(coord);
return coord;
}
static __forceinline__ __device__ bool within_bounds_2d(int h, int w, int H, int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
static __forceinline__ __device__ bool within_bounds_3d(int d, int h, int w, int D, int H, int W) {
return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
}
template <typename scalar_t>
__global__ void grid_sampler_2d_kernel(const int nthreads, const scalar_t *input,
const scalar_t *grid, scalar_t *output,
TensorDesc input_desc, TensorDesc grid_desc,
TensorDesc output_desc,
const GridSamplerInterpolation interpolation_mode,
const GridSamplerPadding padding_mode, bool align_corners) {
int C = input_desc.shape[1];
int inp_H = input_desc.shape[2];
int inp_W = input_desc.shape[3];
int out_H = grid_desc.shape[1];
int out_W = grid_desc.shape[2];
int inp_sN = input_desc.stride[0];
int inp_sC = input_desc.stride[1];
int inp_sH = input_desc.stride[2];
int inp_sW = input_desc.stride[3];
int grid_sN = grid_desc.stride[0];
int grid_sH = grid_desc.stride[1];
int grid_sW = grid_desc.stride[2];
int grid_sCoor = grid_desc.stride[3];
int out_sN = output_desc.stride[0];
int out_sC = output_desc.stride[1];
int out_sH = output_desc.stride[2];
int out_sW = output_desc.stride[3];
CUDA_1D_KERNEL_LOOP(index, nthreads) {
const int w = index % out_W;
const int h = (index / out_W) % out_H;
const int n = index / (out_H * out_W);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
// get the corresponding input x, y coordinates from grid
scalar_t ix = grid[grid_offset];
scalar_t iy = grid[grid_offset + grid_sCoor];
ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners);
iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get NE, NW, SE, SW pixel values from (x, y)
int ix_nw = static_cast<int>(::floor(ix));
int iy_nw = static_cast<int>(::floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
// get surfaces to each neighbor:
scalar_t nw = (ix_se - ix) * (iy_se - iy);
scalar_t ne = (ix - ix_sw) * (iy_sw - iy);
scalar_t sw = (ix_ne - ix) * (iy - iy_ne);
scalar_t se = (ix - ix_nw) * (iy - iy_nw);
// calculate bilinear weighted pixel value and set output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
*out_ptr_NCHW = static_cast<scalar_t>(0);
if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
*out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int ix_nearest = static_cast<int>(::round(ix));
int iy_nearest = static_cast<int>(::round(iy));
// assign nearest neighbor pixel value to output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
*out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<scalar_t>(0);
}
}
}
}
}
template <typename scalar_t>
__global__ void grid_sampler_3d_kernel(const int nthreads, const scalar_t *input,
const scalar_t *grid, scalar_t *output,
TensorDesc input_desc, TensorDesc grid_desc,
TensorDesc output_desc,
const GridSamplerInterpolation interpolation_mode,
const GridSamplerPadding padding_mode, bool align_corners) {
int C = input_desc.shape[1];
int inp_D = input_desc.shape[2];
int inp_H = input_desc.shape[3];
int inp_W = input_desc.shape[4];
int out_D = grid_desc.shape[1];
int out_H = grid_desc.shape[2];
int out_W = grid_desc.shape[3];
int inp_sN = input_desc.stride[0];
int inp_sC = input_desc.stride[1];
int inp_sD = input_desc.stride[2];
int inp_sH = input_desc.stride[3];
int inp_sW = input_desc.stride[4];
int grid_sN = grid_desc.stride[0];
int grid_sD = grid_desc.stride[1];
int grid_sH = grid_desc.stride[2];
int grid_sW = grid_desc.stride[3];
int grid_sCoor = grid_desc.stride[4];
int out_sN = output_desc.stride[0];
int out_sC = output_desc.stride[1];
int out_sD = output_desc.stride[2];
int out_sH = output_desc.stride[3];
int out_sW = output_desc.stride[4];
CUDA_1D_KERNEL_LOOP(index, nthreads) {
const int w = index % out_W;
const int h = (index / out_W) % out_H;
const int d = (index / (out_H * out_W)) % out_D;
const int n = index / (out_D * out_H * out_W);
const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z coordinates from grid
scalar_t ix = grid[grid_offset];
scalar_t iy = grid[grid_offset + grid_sCoor];
scalar_t iz = grid[grid_offset + 2 * grid_sCoor];
ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners);
iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners);
iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode, align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get corner pixel values from (x, y, z)
// for 4d, we use north-east-south-west
// for 5d, we add top-bottom
int ix_tnw = static_cast<int>(::floor(ix));
int iy_tnw = static_cast<int>(::floor(iy));
int iz_tnw = static_cast<int>(::floor(iz));
int ix_tne = ix_tnw + 1;
int iy_tne = iy_tnw;
int iz_tne = iz_tnw;
int ix_tsw = ix_tnw;
int iy_tsw = iy_tnw + 1;
int iz_tsw = iz_tnw;
int ix_tse = ix_tnw + 1;
int iy_tse = iy_tnw + 1;
int iz_tse = iz_tnw;
int ix_bnw = ix_tnw;
int iy_bnw = iy_tnw;
int iz_bnw = iz_tnw + 1;
int ix_bne = ix_tnw + 1;
int iy_bne = iy_tnw;
int iz_bne = iz_tnw + 1;
int ix_bsw = ix_tnw;
int iy_bsw = iy_tnw + 1;
int iz_bsw = iz_tnw + 1;
int ix_bse = ix_tnw + 1;
int iy_bse = iy_tnw + 1;
int iz_bse = iz_tnw + 1;
// get surfaces to each neighbor:
scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
// (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) *
// tne
// + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) *
// tse
// + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) *
// bne
// + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) *
// bse
*out_ptr_NCDHW = static_cast<scalar_t>(0);
if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw;
}
if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne;
}
if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw;
}
if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse;
}
if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw;
}
if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne;
}
if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw;
}
if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse;
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int ix_nearest = static_cast<int>(::round(ix));
int iy_nearest = static_cast<int>(::round(iy));
int iz_nearest = static_cast<int>(::round(iz));
// assign nearest neighbor pixel value to output pixel
auto inp_ptr_NC = input + n * inp_sN;
auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) {
*out_ptr_NCDHW =
inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCDHW = static_cast<scalar_t>(0);
}
}
}
}
}
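// Editor's note: fills `desc` with the given shape and its contiguous
// (row-major) strides, e.g. dims {2, 3, 4, 5} -> stride {60, 20, 5, 1}.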
void create_desc(const int *dims, int nb_dims, TensorDesc &desc) {
memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims);
desc.stride[nb_dims - 1] = 1;
for (int i = nb_dims - 2; i >= 0; --i) {
desc.stride[i] = desc.stride[i + 1] * desc.shape[i + 1];
}
}
template <typename T>
void grid_sample(T *output, const T *input, const T *grid, int *output_dims, int *input_dims,
int *grid_dims, int nb_dims, GridSamplerInterpolation interp,
GridSamplerPadding padding, bool align_corners, cudaStream_t stream) {
TensorDesc input_desc;
create_desc(input_dims, nb_dims, input_desc);
TensorDesc output_desc;
create_desc(output_dims, nb_dims, output_desc);
TensorDesc grid_desc;
create_desc(grid_dims, nb_dims, grid_desc);
int count = 1;
for (int i = 0; i < nb_dims; ++i) {
if (i == 1) {
continue;
}
count *= output_desc.shape[i];
}
if (nb_dims == 4) {
grid_sampler_2d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
count, input, grid, output, input_desc, grid_desc, output_desc, interp, padding,
align_corners);
} else if (nb_dims == 5) {
grid_sampler_3d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
count, input, grid, output, input_desc, grid_desc, output_desc, interp, padding,
align_corners);
} else {
printf("input and grid dims should be 4 or 5\n");
}
}
template void grid_sample<float>(float *output, const float *input, const float *grid,
int *output_dims, int *input_dims, int *grid_dims, int nb_dims,
GridSamplerInterpolation interp, GridSamplerPadding padding,
bool align_corners, cudaStream_t stream);
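// Editor's note: a minimal host-side usage sketch of grid_sample<float> for
// the 4-D (2-D sampling) case, not part of the original commit; sizes are
// illustrative and the device buffers are assumed to be filled elsewhere.
static void grid_sample_example(float *d_output, const float *d_input,
                                const float *d_grid, cudaStream_t stream) {
  int input_dims[4] = {1, 3, 8, 8};   // (N, C, H_in, W_in)
  int grid_dims[4] = {1, 4, 4, 2};    // (N, H_out, W_out, 2)
  int output_dims[4] = {1, 3, 4, 4};  // (N, C, H_out, W_out)
  grid_sample<float>(d_output, d_input, d_grid, output_dims, input_dims,
                     grid_dims, /*nb_dims=*/4,
                     GridSamplerInterpolation::Bilinear,
                     GridSamplerPadding::Zeros, /*align_corners=*/false,
                     stream);
}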
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GRID_SAMPLER_KERNEL_HPP
#define TRT_GRID_SAMPLER_KERNEL_HPP
#include <cuda_runtime.h>
enum class GridSamplerInterpolation { Bilinear, Nearest };
enum class GridSamplerPadding { Zeros, Border, Reflection };
template <typename T>
void grid_sample(T *output, const T *input, const T *grid, int *output_dims, int *input_dims,
int *grid_dims, int nb_dims, GridSamplerInterpolation interp,
GridSamplerPadding padding, bool align_corners, cudaStream_t stream);
#endif // TRT_GRID_SAMPLER_KERNEL_HPP
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp
#include "trt_instance_norm.hpp"
#include <cuda_fp16.h>
#include <stdexcept>
#include "trt_serialize.hpp"
using namespace nvinfer1;
namespace mmdeploy {
namespace {
constexpr const char* PLUGIN_VERSION{"1"};
constexpr const char* PLUGIN_NAME{"TRTInstanceNormalization"};
} // namespace
TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, float epsilon)
: TRTPluginBase(name), mEpsilon(epsilon) {}
TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, void const* serialData,
size_t serialLength)
: TRTPluginBase(name) {
deserialize_value(&serialData, &serialLength, &mEpsilon);
}
TRTInstanceNormalization::~TRTInstanceNormalization() {}
// TRTInstanceNormalization returns one output.
int TRTInstanceNormalization::getNbOutputs() const TRT_NOEXCEPT { return 1; }
DimsExprs TRTInstanceNormalization::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
nvinfer1::DimsExprs output(inputs[0]);
return output;
}
size_t TRTInstanceNormalization::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT {
int n = inputs[0].dims.d[0];
int c = inputs[0].dims.d[1];
int elem_size = sizeof(float);
return getAlignedSize(n * c * elem_size) * 2;
}
int TRTInstanceNormalization::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs,
void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
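// Editor's note: instance norm is computed here as batch norm over the input
// viewed as (1, N*C, H, W): each (n, c) pair becomes its own "channel", so
// per-channel batch-norm statistics are exactly per-instance statistics. The
// workspace holds the scale and bias vectors tiled N times to length N*C.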
nvinfer1::Dims input_dims = inputDesc[0].dims;
int n = input_dims.d[0];
int c = input_dims.d[1];
int h = input_dims.d[2];
int w = input_dims.nbDims > 3 ? input_dims.d[3] : 1;
int elem_size = sizeof(float);
void* n_scales = (void*)workspace;
void* n_bias = (void*)((char*)workspace + getAlignedSize(n * c * elem_size));
const void* scales = (const void*)inputs[1];
const void* bias = (const void*)inputs[2];
for (int i = 0; i < n; ++i) {
cudaMemcpyAsync((char*)n_scales + i * c * elem_size, scales, c * elem_size,
cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync((char*)n_bias + i * c * elem_size, bias, c * elem_size,
cudaMemcpyDeviceToDevice, stream);
}
cudnnSetTensor4dDescriptor(_b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, n * c, 1, 1);
cudnnDataType_t cudnn_dtype{};
convert_trt2cudnn_dtype(inputDesc[0].type, &cudnn_dtype);
cudnnSetTensor4dDescriptor(_x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w);
cudnnSetTensor4dDescriptor(_y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w);
float alpha = 1;
float beta = 0;
void const* x_ptr = inputs[0];
void* y_ptr = outputs[0];
cudnnSetStream(_cudnn_handle, stream);
// Note: Use of CUDNN_BATCHNORM_SPATIAL_PERSISTENT can cause numerical
// overflows (NaNs) for fp32 data in some circumstances. The lower-
// performance CUDNN_BATCHNORM_SPATIAL should be used if this is not
// acceptable.
cudnnBatchNormalizationForwardTraining(_cudnn_handle, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, &alpha,
&beta, _x_desc, x_ptr, _y_desc, y_ptr, _b_desc, n_scales,
n_bias, 1., nullptr, nullptr, mEpsilon, nullptr, nullptr);
return 0;
}
size_t TRTInstanceNormalization::getSerializationSize() const TRT_NOEXCEPT {
return serialized_size(mEpsilon);
}
void TRTInstanceNormalization::serialize(void* buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mEpsilon);
}
bool TRTInstanceNormalization::supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
switch (pos) {
case 0:
case 3:
return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT ||
ioDesc[pos].type == nvinfer1::DataType::kHALF) &&
ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR &&
ioDesc[pos].type == ioDesc[0].type);
case 1:
case 2:
return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR;
default:
return false;
}
return false;
}
const char* TRTInstanceNormalization::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char* TRTInstanceNormalization::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
IPluginV2DynamicExt* TRTInstanceNormalization::clone() const TRT_NOEXCEPT {
auto* plugin = new TRTInstanceNormalization{mLayerName, mEpsilon};
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DataType TRTInstanceNormalization::getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// Attach the plugin object to an execution context and grant the plugin
// access to some context resources.
void TRTInstanceNormalization::attachToContext(cudnnContext* cudnnContext,
cublasContext* cublasContext,
IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {
_cudnn_handle = cudnnContext;
cudnnCreateTensorDescriptor(&_b_desc);
cudnnCreateTensorDescriptor(&_x_desc);
cudnnCreateTensorDescriptor(&_y_desc);
}
// Detach the plugin object from its execution context.
void TRTInstanceNormalization::detachFromContext() TRT_NOEXCEPT {
if (_y_desc) {
cudnnDestroyTensorDescriptor(_y_desc);
_y_desc = nullptr;
}
if (_x_desc) {
cudnnDestroyTensorDescriptor(_x_desc);
_x_desc = nullptr;
}
if (_b_desc) {
cudnnDestroyTensorDescriptor(_b_desc);
_b_desc = nullptr;
}
}
void TRTInstanceNormalization::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) TRT_NOEXCEPT {}
// TRTInstanceNormalizationCreator methods
TRTInstanceNormalizationCreator::TRTInstanceNormalizationCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char* TRTInstanceNormalizationCreator::getPluginName() const TRT_NOEXCEPT {
return PLUGIN_NAME;
}
const char* TRTInstanceNormalizationCreator::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
IPluginV2DynamicExt* TRTInstanceNormalizationCreator::createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT {
float epsilon = 1e-5;
const PluginField* fields = fc->fields;
for (int i = 0; i < fc->nbFields; ++i) {
const char* attrName = fields[i].name;
if (!strcmp(attrName, "epsilon")) {
epsilon = *(static_cast<const float*>(fields[i].data));
}
}
TRTInstanceNormalization* obj = new TRTInstanceNormalization(name, epsilon);
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
IPluginV2DynamicExt* TRTInstanceNormalizationCreator::deserializePlugin(
const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT {
TRTInstanceNormalization* obj = new TRTInstanceNormalization{name, serialData, serialLength};
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
REGISTER_TENSORRT_PLUGIN(TRTInstanceNormalizationCreator);
} // namespace mmdeploy
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.h
#ifndef TRT_INSTANCE_NORMALIZATION_HPP
#define TRT_INSTANCE_NORMALIZATION_HPP
#include <cudnn.h>
#include <iostream>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
typedef unsigned short half_type;
namespace mmdeploy {
class TRTInstanceNormalization final : public TRTPluginBase {
public:
TRTInstanceNormalization(const std::string& name, float epsilon);
TRTInstanceNormalization(const std::string& name, void const* serialData, size_t serialLength);
TRTInstanceNormalization() = delete;
~TRTInstanceNormalization() TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
// DynamicExt plugins return DimsExprs instead of Dims
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void* buffer) const TRT_NOEXCEPT override;
// DynamicExt plugin supportsFormat update.
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
const char* getPluginType() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
void attachToContext(cudnnContext* cudnn, cublasContext* cublas,
nvinfer1::IGpuAllocator* allocator) TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) TRT_NOEXCEPT override;
private:
float mEpsilon{};
cudnnHandle_t _cudnn_handle{};
cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _b_desc{};
};
class TRTInstanceNormalizationCreator : public TRTPluginCreatorBase {
public:
TRTInstanceNormalizationCreator();
~TRTInstanceNormalizationCreator() override = default;
const char* getPluginName() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2DynamicExt* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;
nvinfer1::IPluginV2DynamicExt* deserializePlugin(const char* name, const void* serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_INSTANCE_NORMALIZATION_HPP
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_modulated_deform_conv.hpp"
#include <assert.h>
#include <chrono>
#include "trt_modulated_deform_conv_kernel.hpp"
#include "trt_serialize.hpp"
using namespace nvinfer1;
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVModulatedDeformConv2d"};
} // namespace
ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(
const std::string &name, const nvinfer1::Dims stride, const nvinfer1::Dims padding,
const nvinfer1::Dims dilation, const int deformableGroup, const int group)
: TRTPluginBase(name),
mStride(stride),
mPadding(padding),
mDilation(dilation),
mDeformableGroup(deformableGroup),
mGroup(group) {
mWithBias = false;
}
ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(const std::string name,
const void *data,
size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &mStride);
deserialize_value(&data, &length, &mPadding);
deserialize_value(&data, &length, &mDilation);
deserialize_value(&data, &length, &mDeformableGroup);
deserialize_value(&data, &length, &mGroup);
mWithBias = false;
}
ModulatedDeformableConvPluginDynamic::~ModulatedDeformableConvPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt *ModulatedDeformableConvPluginDynamic::clone() const TRT_NOEXCEPT {
ModulatedDeformableConvPluginDynamic *plugin = new ModulatedDeformableConvPluginDynamic(
mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
static const nvinfer1::IDimensionExpr *get_hw(const nvinfer1::IDimensionExpr *input,
const nvinfer1::IDimensionExpr *weight,
const nvinfer1::IDimensionExpr *stride,
const nvinfer1::IDimensionExpr *pad,
const nvinfer1::IDimensionExpr *dilation,
nvinfer1::IExprBuilder &exprBuilder) {
using DimOp = nvinfer1::DimensionOperation;
auto expr_1 = exprBuilder.constant(1);
// d*(w-1)+1
auto kernel_0 = exprBuilder.operation(DimOp::kSUB, *weight, *expr_1);
auto kernel_1 = exprBuilder.operation(DimOp::kPROD, *dilation, *kernel_0);
auto kernel = exprBuilder.operation(DimOp::kSUM, *kernel_1, *expr_1);
// out = (input + 2 * pad - kernel_extent) / stride + 1
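// e.g. input = 7, kernel = 3, stride = 2, pad = 1, dilation = 1:
// kernel extent = 3, out = (7 + 2 * 1 - 3) / 2 + 1 = 4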
auto out_0 = exprBuilder.operation(DimOp::kSUM, *pad, *pad);
auto out_1 = exprBuilder.operation(DimOp::kSUM, *input, *out_0);
auto out_2 = exprBuilder.operation(DimOp::kSUB, *out_1, *kernel);
auto out_3 = exprBuilder.operation(DimOp::kFLOOR_DIV, *out_2, *stride);
auto out = exprBuilder.operation(DimOp::kSUM, *out_3, *expr_1);
return out;
}
nvinfer1::DimsExprs ModulatedDeformableConvPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
auto weight_dim = inputs[3].d;
nvinfer1::DimsExprs ret;
ret.nbDims = 4;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[3].d[0];
auto input_h = inputs[0].d[2];
auto input_w = inputs[0].d[3];
auto weight_h = weight_dim[2];
auto weight_w = weight_dim[3];
auto dilation_w = exprBuilder.constant(mDilation.d[0]);
auto dilation_h = exprBuilder.constant(mDilation.d[1]);
auto pad_w = exprBuilder.constant(mPadding.d[0]);
auto pad_h = exprBuilder.constant(mPadding.d[1]);
auto stride_w = exprBuilder.constant(mStride.d[0]);
auto stride_h = exprBuilder.constant(mStride.d[1]);
ret.d[2] = get_hw(input_h, weight_h, stride_h, pad_h, dilation_h, exprBuilder);
ret.d[3] = get_hw(input_w, weight_w, stride_w, pad_w, dilation_w, exprBuilder);
return ret;
}
bool ModulatedDeformableConvPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 0) {
return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT ||
ioDesc[pos].type == nvinfer1::DataType::kHALF) &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
}
}
void ModulatedDeformableConvPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) TRT_NOEXCEPT {
if (nbInputs == 5) {
mWithBias = true;
}
}
size_t ModulatedDeformableConvPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const TRT_NOEXCEPT {
int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type);
int batch_size = inputs[0].dims.d[0];
int nInputPlane = inputs[0].dims.d[1];
int inputHeight = inputs[0].dims.d[2];
int inputWidth = inputs[0].dims.d[3];
int nOutputPlane = outputs[0].dims.d[1];
int outputHeight = outputs[0].dims.d[2];
int outputWidth = outputs[0].dims.d[3];
int kH = inputs[3].dims.d[2];
int kW = inputs[3].dims.d[3];
int im2col_step = std::min(32, batch_size);
size_t col_size =
mmdeploy::getAlignedSize(nInputPlane * kW * kH * outputHeight * outputWidth * sizeof_dtype);
return col_size;
}
int ModulatedDeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs,
void *workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
int batch = inputDesc[0].dims.d[0];
int channels = inputDesc[0].dims.d[1];
int height = inputDesc[0].dims.d[2];
int width = inputDesc[0].dims.d[3];
int channels_out = outputDesc[0].dims.d[1];
int kernel_h = inputDesc[3].dims.d[2];
int kernel_w = inputDesc[3].dims.d[3];
const void *x = inputs[0];
const void *offset = inputs[1];
const void *mask = inputs[2];
const void *weight = inputs[3];
const void *bias = mWithBias ? inputs[4] : nullptr;
void *output = outputs[0];
int im2col_step = std::min(batch, 32);
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
ModulatedDeformConvForwardCUDAKernelLauncher<float>(
(float *)x, (float *)weight, (float *)bias, (float *)offset, (float *)mask,
(float *)output, workSpace, batch, channels, height, width, channels_out, kernel_w,
kernel_h, mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0],
mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream);
break;
case nvinfer1::DataType::kHALF:
ModulatedDeformConvForwardCUDAKernelLauncher<half>(
(half *)x, (half *)weight, (half *)bias, (half *)offset, (half *)mask, (half *)output,
workSpace, batch, channels, height, width, channels_out, kernel_w, kernel_h, mStride.d[0],
mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], mGroup,
mDeformableGroup, im2col_step, m_cublas_handle, stream);
break;
default:
return 1;
break;
}
return 0;
}
nvinfer1::DataType ModulatedDeformableConvPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *ModulatedDeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT {
return PLUGIN_NAME;
}
const char *ModulatedDeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
int ModulatedDeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t ModulatedDeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT {
return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) +
serialized_size(mDeformableGroup) + serialized_size(mGroup);
}
void ModulatedDeformableConvPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mStride);
serialize_value(&buffer, mPadding);
serialize_value(&buffer, mDilation);
serialize_value(&buffer, mDeformableGroup);
serialize_value(&buffer, mGroup);
}
void ModulatedDeformableConvPluginDynamic::attachToContext(
cudnnContext *cudnnContext, cublasContext *cublasContext,
nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT {
m_cublas_handle = cublasContext;
}
void ModulatedDeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {}
////////////////////// creator /////////////////////////////
ModulatedDeformableConvPluginDynamicCreator::ModulatedDeformableConvPluginDynamicCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("stride"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("padding"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("groups"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT {
return PLUGIN_NAME;
}
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
nvinfer1::Dims stride{2, {1, 1}};
nvinfer1::Dims padding{2, {0, 0}};
nvinfer1::Dims dilation{2, {1, 1}};
int deformableGroup = 1;
int group = 1;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("deform_groups") == 0) {
deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("groups") == 0) {
group = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("stride") == 0) {
stride.nbDims = 2;
stride.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
stride.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
if (field_name.compare("padding") == 0) {
padding.nbDims = 2;
padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
if (field_name.compare("dilation") == 0) {
dilation.nbDims = 2;
dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
}
ModulatedDeformableConvPluginDynamic *plugin = new ModulatedDeformableConvPluginDynamic(
name, stride, padding, dilation, deformableGroup, group);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
auto plugin = new ModulatedDeformableConvPluginDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(ModulatedDeformableConvPluginDynamicCreator);
} // namespace mmdeploy
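// --- Usage sketch (not part of the original sources) -----------------------
// A minimal, hedged example of driving the creator above directly. The field
// names match those registered in the creator's constructor; the int32 layout
// of each field is an assumption based on how createPlugin() reads the data,
// and the helper name make_mdcn_plugin is hypothetical.
static nvinfer1::IPluginV2 *make_mdcn_plugin() {
  static const int32_t stride[2] = {1, 1};
  static const int32_t padding[2] = {1, 1};
  static const int32_t dilation[2] = {1, 1};
  static const int32_t groups = 1;
  static const int32_t deform_groups = 1;
  std::vector<nvinfer1::PluginField> fields{
      {"stride", stride, nvinfer1::PluginFieldType::kINT32, 2},
      {"padding", padding, nvinfer1::PluginFieldType::kINT32, 2},
      {"dilation", dilation, nvinfer1::PluginFieldType::kINT32, 2},
      {"groups", &groups, nvinfer1::PluginFieldType::kINT32, 1},
      {"deform_groups", &deform_groups, nvinfer1::PluginFieldType::kINT32, 1}};
  const nvinfer1::PluginFieldCollection fc{static_cast<int32_t>(fields.size()), fields.data()};
  mmdeploy::ModulatedDeformableConvPluginDynamicCreator creator;
  return creator.createPlugin("mdcn", &fc);  // caller owns the returned plugin
}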
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_MODULATED_DEFORM_CONV_HPP
#define TRT_MODULATED_DEFORM_CONV_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class ModulatedDeformableConvPluginDynamic : public TRTPluginBase {
public:
ModulatedDeformableConvPluginDynamic(const std::string &name, const nvinfer1::Dims stride,
const nvinfer1::Dims padding, const nvinfer1::Dims dilation,
const int deformableGroup, const int group);
ModulatedDeformableConvPluginDynamic(const std::string name, const void *data, size_t length);
ModulatedDeformableConvPluginDynamic() = delete;
~ModulatedDeformableConvPluginDynamic() TRT_NOEXCEPT override;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext,
nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
private:
nvinfer1::Dims mStride;
nvinfer1::Dims mPadding;
nvinfer1::Dims mDilation;
int mDeformableGroup;
int mGroup;
bool mWithBias;
cublasHandle_t m_cublas_handle;
};
class ModulatedDeformableConvPluginDynamicCreator : public TRTPluginCreatorBase {
public:
ModulatedDeformableConvPluginDynamicCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_MODULATED_DEFORM_CONV_HPP
// Copyright (c) OpenMMLab. All rights reserved.
#include <assert.h>
#include <cuda_fp16.h>
#include "common_cuda_helper.hpp"
#include "modulated_deform_conv/modulated_deform_conv_cuda.cuh"
#include "trt_modulated_deform_conv_kernel.hpp"
#include "trt_plugin_helper.hpp"
template <typename T>
void trt_modulated_deformable_im2col(const T* data_im_, const T* data_offset_, const T* data_mask_,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
                                     const int kernel_h, const int kernel_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int deformable_group, T* data_col_,
cudaStream_t stream) {
  // launch one thread per (channel, sample, output location) column element
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
modulated_deformable_im2col_gpu_kernel<T>
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
          num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
batch_size, channels, deformable_group, height_col, width_col, data_col_);
cudaCheckError();
}
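// output_add_bias_kernel adds a per-channel bias to an NCHW tensor;
// (index % step_batch) / step_channel recovers the channel id of each element.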
template <typename scalar_t>
__global__ void output_add_bias_kernel(scalar_t* output, const scalar_t* bias, size_t step_batch,
size_t step_channel, size_t n) {
CUDA_1D_KERNEL_LOOP(index, n) { output[index] += bias[(index % step_batch) / step_channel]; }
}
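// half-precision __hadd is only available on sm_53 and newer; older
// architectures fall back to the float round-trip specialization below.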
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
template <>
__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias,
size_t step_batch, size_t step_channel, size_t n) {
CUDA_1D_KERNEL_LOOP(index, n) {
const __half b = bias[(index % step_batch) / step_channel];
const __half o = output[index];
output[index] = __hadd(o, b);
}
}
#else
template <>
__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias,
size_t step_batch, size_t step_channel, size_t n) {
CUDA_1D_KERNEL_LOOP(index, n) {
const __half b = bias[(index % step_batch) / step_channel];
const __half o = output[index];
output[index] = __float2half(__half2float(o) + __half2float(b));
}
}
#endif
template <typename scalar_t>
static void output_add_bias(scalar_t* output, const scalar_t* bias, size_t batch, size_t channel,
size_t height, size_t width, cudaStream_t stream) {
size_t step_channel = height * width;
size_t step_batch = step_channel * channel;
size_t n = step_batch * batch;
output_add_bias_kernel<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(output, bias, step_batch,
step_channel, n);
}
template <typename scalar_t>
void ModulatedDeformConvForwardCUDAKernelLauncher(
const scalar_t* input, const scalar_t* weight, const scalar_t* bias, const scalar_t* offset,
const scalar_t* mask, scalar_t* output, void* workspace, int batch, int channels, int height,
int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
cublasHandle_t cublas_handle, cudaStream_t stream) {
bool with_bias = (bias != nullptr);
im2col_step = std::min(int(batch), im2col_step);
assert(batch % im2col_step == 0);
const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
scalar_t* columns = (scalar_t*)workspace;
const size_t input_step = channels * height * width;
const size_t offset_step = deformable_group * kernel_h * kernel_w * 2 * height_out * width_out;
const size_t mask_step = deformable_group * kernel_h * kernel_w * height_out * width_out;
const size_t out_step = channels_out * height_out * width_out;
const size_t out_group_step = out_step / group;
const size_t col_g_step = channels * kernel_w * kernel_h / group * height_out * width_out;
const size_t weight_g_step = channels_out / group * channels / group * kernel_h * kernel_w;
const int m = channels_out / group;
const int n = height_out * width_out;
const int k = channels / group * kernel_h * kernel_w;
scalar_t alpha = 1.;
scalar_t beta = 0.;
for (int b = 0; b < batch; b++) {
const scalar_t* input_start = input + b * input_step;
const scalar_t* offset_start = offset + b * offset_step;
const scalar_t* mask_start = mask + b * mask_step;
trt_modulated_deformable_im2col<scalar_t>(
input_start, offset_start, mask_start, 1, channels, height, width, height_out, width_out,
kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
deformable_group, columns, stream);
for (int g = 0; g < group; g++) {
const scalar_t* weight_start = weight + g * weight_g_step;
scalar_t* col_start = columns + g * col_g_step;
scalar_t* out_buffer_start = output + b * out_step + g * out_group_step;
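      // per-group GEMM: out_buffer (m x n) = weight_g (m x k) * columns_g (k x n);
      // row-major buffers are handed to column-major cuBLAS with the operands swapped.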
cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start,
n, weight_start, k, &beta, out_buffer_start, n);
cudaCheckError();
}
}
if (with_bias) {
output_add_bias<scalar_t>(output, bias, batch, channels_out, height_out, width_out, stream);
}
}
template void ModulatedDeformConvForwardCUDAKernelLauncher<float>(
const float* input, const float* weight, const float* bias, const float* offset,
const float* mask, float* output, void* workspace, int batch, int channels, int height,
int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
cublasHandle_t cublas_handle, cudaStream_t stream);
template void ModulatedDeformConvForwardCUDAKernelLauncher<__half>(
const __half* input, const __half* weight, const __half* bias, const __half* offset,
const __half* mask, __half* output, void* workspace, int batch, int channels, int height,
int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
cublasHandle_t cublas_handle, cudaStream_t stream);
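// --- Workspace sizing sketch (an illustration, not part of the original
// sources; the helper name is hypothetical): the launcher above uses
// `workspace` only for the im2col "columns" buffer, i.e. col_g_step * group =
// channels * kernel_h * kernel_w * height_out * width_out elements of scalar_t.
static inline size_t mdcn_columns_bytes(int channels, int kernel_h, int kernel_w, int height,
                                        int width, int stride_h, int stride_w, int pad_h,
                                        int pad_w, int dilation_h, int dilation_w,
                                        size_t elem_size) {
  const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  return static_cast<size_t>(channels) * kernel_h * kernel_w * height_out * width_out * elem_size;
}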
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_MODULATED_DEFORM_CONV_KERNEL_HPP
#define TRT_MODULATED_DEFORM_CONV_KERNEL_HPP
#include <cublas_v2.h>
#include <cuda_runtime.h>
template <typename scalar_t>
void ModulatedDeformConvForwardCUDAKernelLauncher(
const scalar_t* input, const scalar_t* weight, const scalar_t* bias, const scalar_t* offset,
const scalar_t* mask, scalar_t* output, void* workspace, int batch, int channels, int height,
int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
cublasHandle_t cublas_handle, cudaStream_t stream);
#endif  // TRT_MODULATED_DEFORM_CONV_KERNEL_HPP
// Copyright (c) OpenMMLab. All rights reserved.
#include "trt_multi_level_roi_align.hpp"
#include <assert.h>
#include <chrono>
#include "trt_multi_level_roi_align_kernel.hpp"
#include "trt_plugin_helper.hpp"
#include "trt_serialize.hpp"
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVMultiLevelRoiAlign"};
} // namespace
TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight,
int alignedWidth, int poolMode, int sampleNum,
const std::vector<float> &featmapStrides,
float roiScaleFactor, int finestScale, bool aligned)
: TRTPluginBase(name),
mAlignedHeight(alignedHeight),
mAlignedWidth(alignedWidth),
mPoolMode(poolMode),
mSampleNum(sampleNum),
mFeatmapStrides(featmapStrides),
mRoiScaleFactor(roiScaleFactor),
mFinestScale(finestScale),
mAligned(aligned) {}
TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string name, const void *data,
size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &mAlignedHeight);
deserialize_value(&data, &length, &mAlignedWidth);
deserialize_value(&data, &length, &mPoolMode);
deserialize_value(&data, &length, &mSampleNum);
deserialize_value(&data, &length, &mRoiScaleFactor);
deserialize_value(&data, &length, &mFinestScale);
deserialize_value(&data, &length, &mAligned);
deserialize_value(&data, &length, &mFeatmapStrides);
}
nvinfer1::IPluginV2DynamicExt *TRTMultiLevelRoiAlign::clone() const TRT_NOEXCEPT {
TRTMultiLevelRoiAlign *plugin =
new TRTMultiLevelRoiAlign(mLayerName, mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum,
mFeatmapStrides, mRoiScaleFactor, mFinestScale, mAligned);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs TRTMultiLevelRoiAlign::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
  // note: nbInputs should equal mFeatmapStrides.size() + 1 (rois plus one feature map per level)
nvinfer1::DimsExprs ret;
ret.nbDims = 4;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[1].d[1];
ret.d[2] = exprBuilder.constant(mAlignedHeight);
ret.d[3] = exprBuilder.constant(mAlignedWidth);
return ret;
}
bool TRTMultiLevelRoiAlign::supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc *ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
}
void TRTMultiLevelRoiAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs,
int nbOutputs) TRT_NOEXCEPT {
// Validate input arguments
ASSERT(nbOutputs == 1);
ASSERT(nbInputs >= 1);
mFeatmapStrides =
std::vector<float>(mFeatmapStrides.begin(), mFeatmapStrides.begin() + (nbInputs - 1));
}
size_t TRTMultiLevelRoiAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT {
return 0;
}
int TRTMultiLevelRoiAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
int num_rois = inputDesc[0].dims.d[0];
int batch_size = inputDesc[1].dims.d[0];
int channels = inputDesc[1].dims.d[1];
const int kMaxFeatMap = 10;
int heights[kMaxFeatMap];
int widths[kMaxFeatMap];
float strides[kMaxFeatMap];
  int num_feats = mFeatmapStrides.size();
  ASSERT(num_feats <= kMaxFeatMap);  // guard the fixed-size arrays above
for (int i = 0; i < num_feats; ++i) {
heights[i] = inputDesc[i + 1].dims.d[2];
widths[i] = inputDesc[i + 1].dims.d[3];
strides[i] = mFeatmapStrides[i];
}
const void *rois = inputs[0];
const void *const *feats = inputs + 1;
multi_level_roi_align<float>((float *)outputs[0], (const float *)rois, num_rois, feats, num_feats,
batch_size, channels, &heights[0], &widths[0], &strides[0],
mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum,
mRoiScaleFactor, mFinestScale, mAligned, stream);
return 0;
}
nvinfer1::DataType TRTMultiLevelRoiAlign::getOutputDataType(int index,
const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return nvinfer1::DataType::kFLOAT;
}
// IPluginV2 Methods
const char *TRTMultiLevelRoiAlign::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTMultiLevelRoiAlign::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
int TRTMultiLevelRoiAlign::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t TRTMultiLevelRoiAlign::getSerializationSize() const TRT_NOEXCEPT {
return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) +
serialized_size(mAlignedWidth) + serialized_size(mPoolMode) + serialized_size(mSampleNum) +
serialized_size(mRoiScaleFactor) + serialized_size(mFinestScale) +
serialized_size(mAligned);
}
void TRTMultiLevelRoiAlign::serialize(void *buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mAlignedHeight);
serialize_value(&buffer, mAlignedWidth);
serialize_value(&buffer, mPoolMode);
serialize_value(&buffer, mSampleNum);
serialize_value(&buffer, mRoiScaleFactor);
serialize_value(&buffer, mFinestScale);
serialize_value(&buffer, mAligned);
serialize_value(&buffer, mFeatmapStrides);
}
TRTMultiLevelRoiAlignCreator::TRTMultiLevelRoiAlignCreator() {
mPluginAttributes = std::vector<nvinfer1::PluginField>(
{nvinfer1::PluginField("output_height"), nvinfer1::PluginField("output_width"),
nvinfer1::PluginField("pool_mode"), nvinfer1::PluginField("sampling_ratio"),
nvinfer1::PluginField("featmap_strides"), nvinfer1::PluginField("roi_scale_factor"),
nvinfer1::PluginField("finest_scale"), nvinfer1::PluginField("aligned")});
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *TRTMultiLevelRoiAlignCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTMultiLevelRoiAlignCreator::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
int alignedHeight = 7;
int alignedWidth = 7;
int poolMode = 0;
int sampleNum = 2;
std::vector<float> featmapStrides;
float roiScaleFactor = -1;
int finestScale = 56;
bool aligned = false;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("output_height") == 0) {
alignedHeight = static_cast<const int *>(fc->fields[i].data)[0];
} else if (field_name.compare("output_width") == 0) {
alignedWidth = static_cast<const int *>(fc->fields[i].data)[0];
} else if (field_name.compare("pool_mode") == 0) {
poolMode = static_cast<const int *>(fc->fields[i].data)[0];
} else if (field_name.compare("sampling_ratio") == 0) {
sampleNum = static_cast<const int *>(fc->fields[i].data)[0];
} else if (field_name.compare("roi_scale_factor") == 0) {
roiScaleFactor = static_cast<const float *>(fc->fields[i].data)[0];
} else if (field_name.compare("finest_scale") == 0) {
finestScale = static_cast<const int *>(fc->fields[i].data)[0];
} else if (field_name.compare("featmap_strides") == 0) {
int data_size = (fc->fields[i].length);
const float *data_start = static_cast<const float *>(fc->fields[i].data);
featmapStrides = std::vector<float>(data_start, data_start + data_size);
} else if (field_name.compare("aligned") == 0) {
int aligned_int = static_cast<const int *>(fc->fields[i].data)[0];
aligned = aligned_int != 0;
}
}
ASSERT(featmapStrides.size() != 0);
TRTMultiLevelRoiAlign *plugin =
new TRTMultiLevelRoiAlign(name, alignedHeight, alignedWidth, poolMode, sampleNum,
featmapStrides, roiScaleFactor, finestScale, aligned);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
auto plugin = new TRTMultiLevelRoiAlign(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRoiAlignCreator);
} // namespace mmdeploy
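// --- Usage sketch (not part of the original sources) -----------------------
// Once REGISTER_TENSORRT_PLUGIN has run, the creator can be fetched from the
// global registry under the name/version defined at the top of this file
// (assuming the default empty plugin namespace). The field types below are
// assumptions based on how createPlugin() reads them; make_roi_align_plugin
// is a hypothetical helper.
static nvinfer1::IPluginV2 *make_roi_align_plugin() {
  auto *creator = getPluginRegistry()->getPluginCreator("MMCVMultiLevelRoiAlign", "1");
  if (creator == nullptr) return nullptr;
  static const int32_t out_hw = 7;
  static const int32_t sampling_ratio = 2;
  static const float featmap_strides[4] = {4.f, 8.f, 16.f, 32.f};
  std::vector<nvinfer1::PluginField> fields{
      {"output_height", &out_hw, nvinfer1::PluginFieldType::kINT32, 1},
      {"output_width", &out_hw, nvinfer1::PluginFieldType::kINT32, 1},
      {"sampling_ratio", &sampling_ratio, nvinfer1::PluginFieldType::kINT32, 1},
      {"featmap_strides", featmap_strides, nvinfer1::PluginFieldType::kFLOAT32, 4}};
  const nvinfer1::PluginFieldCollection fc{static_cast<int32_t>(fields.size()), fields.data()};
  return creator->createPlugin("roi_align", &fc);
}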
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_MULTI_LEVEL_ROI_ALIGN_HPP
#define TRT_MULTI_LEVEL_ROI_ALIGN_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class TRTMultiLevelRoiAlign : public TRTPluginBase {
public:
TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight, int alignedWidth, int poolMode,
int sampleNum, const std::vector<float> &featmapStrides,
float roiScaleFactor = -1, int finestScale = 56, bool aligned = false);
TRTMultiLevelRoiAlign(const std::string name, const void *data, size_t length);
TRTMultiLevelRoiAlign() = delete;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
private:
int mAlignedHeight;
int mAlignedWidth;
int mPoolMode;
int mSampleNum;
std::vector<float> mFeatmapStrides;
float mRoiScaleFactor;
int mFinestScale;
bool mAligned;
};
class TRTMultiLevelRoiAlignCreator : public TRTPluginCreatorBase {
public:
TRTMultiLevelRoiAlignCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif  // TRT_MULTI_LEVEL_ROI_ALIGN_HPP
// Copyright (c) OpenMMLab. All rights reserved.
#include <float.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include "common_cuda_helper.hpp"
#include "trt_multi_level_roi_align_kernel.hpp"
#include "trt_plugin_helper.hpp"
const int kMAX_FEATMAP_SIZE = 10;
struct FeatData {
const void *data[kMAX_FEATMAP_SIZE];
int batch_size;
int channels;
int h[kMAX_FEATMAP_SIZE];
int w[kMAX_FEATMAP_SIZE];
float spatial_scale[kMAX_FEATMAP_SIZE];
int num_featmap;
};
template <typename scalar_t, bool aligned, int pool_mode>
__device__ scalar_t roi_align_single(const scalar_t *__restrict__ bottom_data,
const int roi_batch_ind, const scalar_t roi_start_w,
const scalar_t roi_start_h, const scalar_t roi_end_w,
const scalar_t roi_end_h, const scalar_t spatial_scale,
const int pw, const int ph, const int c, const int sample_num,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width) {
// Force malformed ROIs to be 1x1
scalar_t roi_width = max(roi_end_w - roi_start_w, (scalar_t)(aligned ? 0. : 1.));
scalar_t roi_height = max(roi_end_h - roi_start_h, (scalar_t)(aligned ? 0. : 1.));
const scalar_t bin_size_h = roi_height / pooled_height;
const scalar_t bin_size_w = roi_width / pooled_width;
const scalar_t *offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
const int sample_num_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height);
const int sample_num_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
scalar_t output_val = (pool_mode == 0) ? -FLT_MAX : 0;
const scalar_t y_offset = roi_start_h + ph * bin_size_h;
const scalar_t y_scale = bin_size_h / (scalar_t)(sample_num_h);
const scalar_t x_offset = roi_start_w + pw * bin_size_w;
const scalar_t x_scale = bin_size_w / (scalar_t)(sample_num_w);
for (int iy = 0; iy < sample_num_h; iy++) {
const scalar_t y = fma(scalar_t(iy) + scalar_t(.5f), y_scale, y_offset);
for (int ix = 0; ix < sample_num_w; ix++) {
const scalar_t x = fma(scalar_t(ix) + scalar_t(.5f), x_scale, x_offset);
scalar_t val = bilinear_interpolate<scalar_t>(offset_bottom_data, height, width, y, x);
if (pool_mode == 0) {
output_val = max(output_val, val);
} else {
output_val += val;
}
}
}
if (pool_mode != 0) {
output_val /= max(sample_num_h * sample_num_w, 1);
}
return output_val;
}
template <typename scalar_t, bool aligned>
__global__ void roi_extractor_kernel(scalar_t *__restrict__ output,
const scalar_t *__restrict__ bottom_rois, FeatData feat_data,
const int pool_mode, const int sample_num,
const float roi_scale_factor, const int finest_scale,
const int pooled_height, const int pooled_width,
int nThreads) {
CUDA_1D_KERNEL_LOOP(index, nThreads) {
const int channels = feat_data.channels;
int tmp_index = index;
const int pw = tmp_index % pooled_width;
tmp_index /= pooled_width;
const int ph = tmp_index % pooled_height;
tmp_index /= pooled_height;
const int c = tmp_index % channels;
const int n = tmp_index / channels;
const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
scalar_t roi_offset_x0 = offset_bottom_rois[1];
scalar_t roi_offset_y0 = offset_bottom_rois[2];
scalar_t roi_offset_x1 = offset_bottom_rois[3];
scalar_t roi_offset_y1 = offset_bottom_rois[4];
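    // FPN-style level assignment:
    // lvl = clamp(floor(log2(sqrt(roi_area) / finest_scale)), 0, num_featmap - 1)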
const scalar_t scale = sqrtf((roi_offset_y1 - roi_offset_y0) * (roi_offset_x1 - roi_offset_x0));
const int target_lvls =
min(feat_data.num_featmap - 1,
max(0, int(floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6)))));
if (roi_scale_factor > 0.) {
const scalar_t roi_off_cx = (roi_offset_x0 + roi_offset_x1) * 0.5;
const scalar_t roi_off_cy = (roi_offset_y0 + roi_offset_y1) * 0.5;
const scalar_t half_scale_factor = roi_scale_factor * 0.5;
const scalar_t half_roi_off_w =
fma(roi_offset_x1 - roi_offset_x0 + 1, half_scale_factor, scalar_t(-0.5));
const scalar_t half_roi_off_h =
fma(roi_offset_y1 - roi_offset_y0 + 1, half_scale_factor, scalar_t(-0.5));
roi_offset_x0 = roi_off_cx - half_roi_off_w;
roi_offset_x1 = roi_off_cx + half_roi_off_w;
roi_offset_y0 = roi_off_cy - half_roi_off_h;
roi_offset_y1 = roi_off_cy + half_roi_off_h;
}
const scalar_t spatial_scale = (scalar_t)feat_data.spatial_scale[target_lvls];
const int height = feat_data.h[target_lvls];
const int width = feat_data.w[target_lvls];
const scalar_t *bottom_data = (scalar_t *)feat_data.data[target_lvls];
const int roi_batch_ind = offset_bottom_rois[0];
const scalar_t offset = aligned ? (scalar_t)-0.5 : (scalar_t)0.0;
    // coord * spatial_scale + offset, where offset is -0.5 when aligned
    const scalar_t roi_start_w = fma(roi_offset_x0, spatial_scale, offset);
    const scalar_t roi_start_h = fma(roi_offset_y0, spatial_scale, offset);
    const scalar_t roi_end_w = fma(roi_offset_x1, spatial_scale, offset);
    const scalar_t roi_end_h = fma(roi_offset_y1, spatial_scale, offset);
if (pool_mode == 0) {
const scalar_t output_val = roi_align_single<scalar_t, aligned, 0>(
bottom_data, roi_batch_ind, roi_start_w, roi_start_h, roi_end_w, roi_end_h, spatial_scale,
pw, ph, c, sample_num, channels, height, width, pooled_height, pooled_width);
output[index] = output_val;
} else {
const scalar_t output_val = roi_align_single<scalar_t, aligned, 1>(
bottom_data, roi_batch_ind, roi_start_w, roi_start_h, roi_end_w, roi_end_h, spatial_scale,
pw, ph, c, sample_num, channels, height, width, pooled_height, pooled_width);
output[index] = output_val;
}
}
}
template <typename T>
void multi_level_roi_align(T *output, const T *rois, int num_rois, const void *const *feats,
int num_feats, int n, int c, int *h, int *w, float *strides,
int aligned_height, int aligned_width, int pool_mode, int sample_num,
float roi_scale_factor, int finest_scale, bool aligned,
cudaStream_t stream) {
FeatData feat_data;
feat_data.batch_size = n;
feat_data.channels = c;
feat_data.num_featmap = num_feats;
for (int i = 0; i < num_feats; ++i) {
feat_data.data[i] = feats[i];
feat_data.h[i] = h[i];
feat_data.w[i] = w[i];
feat_data.spatial_scale[i] = 1. / float(strides[i]);
}
int nThreads = num_rois * c * aligned_height * aligned_width;
if (aligned) {
roi_extractor_kernel<T, true><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
output, rois, feat_data, pool_mode, sample_num, roi_scale_factor, finest_scale,
aligned_height, aligned_width, nThreads);
} else {
roi_extractor_kernel<T, false><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
output, rois, feat_data, pool_mode, sample_num, roi_scale_factor, finest_scale,
aligned_height, aligned_width, nThreads);
}
}
template void multi_level_roi_align<float>(float *output, const float *rois, int num_rois,
const void *const *feats, int num_feats, int n, int c,
int *h, int *w, float *strides, int aligned_height,
int aligned_width, int pool_mode, int sample_num,
float roi_scale_factor, int finest_scale, bool aligned,
cudaStream_t stream);
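// --- Output sizing sketch (an illustration, not part of the original sources;
// the helper name is hypothetical): the kernel writes one value per launched
// thread, so `output` must hold num_rois * c * aligned_height * aligned_width
// elements.
static inline size_t roi_align_output_elems(int num_rois, int channels, int aligned_height,
                                            int aligned_width) {
  return static_cast<size_t>(num_rois) * channels * aligned_height * aligned_width;
}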