Commit 546b4279 authored by limm

add csrc and mmdeploy module

parent 502f4fb9
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include "kernel.h"
template <typename KeyT, typename ValueT>
size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) {
size_t temp_storage_bytes = 0;
cub::DeviceSegmentedRadixSort::SortPairsDescending((void*)NULL, temp_storage_bytes,
(const KeyT*)NULL, (KeyT*)NULL,
(const ValueT*)NULL, (ValueT*)NULL,
num_items, // # items
num_segments, // # segments
(const int*)NULL, (const int*)NULL);
return temp_storage_bytes;
}
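// Editor's note: illustrative sketch (not part of this commit) of the two-phase CUB pattern this
// helper supports: query the temp-storage size with null pointers, allocate that many bytes, then
// run the real segmented sort. The buffer names below are hypothetical.
//
//   size_t temp_bytes = cubSortPairsWorkspaceSize<float, int>(num_items, num_segments);
//   void* d_temp = nullptr;
//   cudaMalloc(&d_temp, temp_bytes);
//   cub::DeviceSegmentedRadixSort::SortPairsDescending(
//       d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, num_items, num_segments,
//       d_offsets, d_offsets + 1, 0, sizeof(float) * 8, stream);
//   cudaFree(d_temp);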
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#ifndef TRT_KERNEL_H
#define TRT_KERNEL_H
#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>
#include "cublas_v2.h"
#include "trt_plugin_helper.hpp"
using namespace nvinfer1;
#define DEBUG_ENABLE 0
template <typename T>
struct Bbox {
T xmin, ymin, xmax, ymax;
Bbox(T xmin, T ymin, T xmax, T ymax) : xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax) {}
Bbox() = default;
};
size_t get_cuda_arch(int devID);
int8_t* alignPtr(int8_t* ptr, uintptr_t to);
int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize);
void setUniformOffsets(cudaStream_t stream, int num_segments, int offset, int* d_offsets);
pluginStatus_t allClassNMS(cudaStream_t stream, int num, int num_classes, int num_preds_per_class,
int top_k, float nms_threshold, bool share_location, bool isNormalized,
DataType DT_SCORE, DataType DT_BBOX, void* bbox_data,
void* beforeNMS_scores, void* beforeNMS_index_array,
void* afterNMS_scores, void* afterNMS_index_array, bool flipXY = false);
pluginStatus_t allClassRotatedNMS(cudaStream_t stream, int num, int num_classes,
int num_preds_per_class, int top_k, float nms_threshold,
bool share_location, bool isNormalized, DataType DT_SCORE,
DataType DT_BBOX, void* bbox_data, void* beforeNMS_scores,
void* beforeNMS_index_array, void* afterNMS_scores,
void* afterNMS_index_array, bool flipXY = false);
size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX);
size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX);
size_t sortScoresPerClassWorkspaceSize(int num, int num_classes, int num_preds_per_class,
DataType DT_CONF);
size_t sortScoresPerImageWorkspaceSize(int num_images, int num_items_per_image, DataType DT_SCORE);
pluginStatus_t sortScoresPerImage(cudaStream_t stream, int num_images, int num_items_per_image,
DataType DT_SCORE, void* unsorted_scores,
void* unsorted_bbox_indices, void* sorted_scores,
void* sorted_bbox_indices, void* workspace);
pluginStatus_t sortScoresPerClass(cudaStream_t stream, int num, int num_classes,
int num_preds_per_class, int background_label_id,
float confidence_threshold, DataType DT_SCORE,
void* conf_scores_gpu, void* index_array_gpu, void* workspace);
size_t calculateTotalWorkspaceSize(size_t* workspaces, int count);
pluginStatus_t permuteData(cudaStream_t stream, int nthreads, int num_classes, int num_data,
int num_dim, DataType DT_DATA, bool confSigmoid, const void* data,
void* new_data);
size_t detectionForwardPreNMSSize(int N, int C2);
size_t detectionForwardPostNMSSize(int N, int numClasses, int topK);
pluginStatus_t gatherNMSOutputs(cudaStream_t stream, bool shareLocation, int numImages,
int numPredsPerClass, int numClasses, int topK, int keepTopK,
DataType DT_BBOX, DataType DT_SCORE, const void* indices,
const void* scores, const void* bboxData, void* nmsedDets,
void* nmsedLabels, void* nmsedIndex = nullptr,
bool clipBoxes = true, bool rotated = false);
size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses,
int numPredsPerClass, int topK, DataType DT_BBOX,
DataType DT_SCORE);
#endif
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_PLUGIN_BASE_HPP
#define TRT_PLUGIN_BASE_HPP
#include "NvInferRuntime.h"
#include "NvInferVersion.h"
#include "trt_plugin_helper.hpp"
namespace mmdeploy {
#if NV_TENSORRT_MAJOR > 7
#define TRT_NOEXCEPT noexcept
#else
#define TRT_NOEXCEPT
#endif
class TRTPluginBase : public nvinfer1::IPluginV2DynamicExt {
public:
TRTPluginBase(const std::string &name) : mLayerName(name) {}
// IPluginV2 Methods
const char *getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
int initialize() TRT_NOEXCEPT override { return STATUS_SUCCESS; }
void terminate() TRT_NOEXCEPT override {}
void destroy() TRT_NOEXCEPT override { delete this; }
void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override {
mNamespace = pluginNamespace;
}
const char *getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }
virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override {}
virtual size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override {
return 0;
}
virtual void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext,
nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override {}
virtual void detachFromContext() TRT_NOEXCEPT override {}
protected:
const std::string mLayerName;
std::string mNamespace;
#if NV_TENSORRT_MAJOR < 8
protected:
// To prevent compiler warnings.
using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch;
using nvinfer1::IPluginV2DynamicExt::enqueue;
using nvinfer1::IPluginV2DynamicExt::getOutputDimensions;
using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch;
using nvinfer1::IPluginV2DynamicExt::supportsFormat;
#endif
};
class TRTPluginCreatorBase : public nvinfer1::IPluginCreator {
public:
const char *getPluginVersion() const TRT_NOEXCEPT override { return "1"; };
const nvinfer1::PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return &mFC; }
void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override {
mNamespace = pluginNamespace;
}
const char *getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }
protected:
nvinfer1::PluginFieldCollection mFC;
std::vector<nvinfer1::PluginField> mPluginAttributes;
std::string mNamespace;
};
} // namespace mmdeploy
#endif
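// Editor's note: a minimal sketch (not part of this commit) of what a concrete plugin derived from
// TRTPluginBase still has to override, assuming #include "trt_plugin_base.hpp"; "MyPlugin" and its
// behavior are hypothetical. TRTPluginBase already supplies getPluginVersion, initialize,
// terminate, destroy, the namespace accessors and default configurePlugin/getWorkspaceSize.
//
//   class MyPlugin : public mmdeploy::TRTPluginBase {
//    public:
//     explicit MyPlugin(const std::string &name) : TRTPluginBase(name) {}
//     // IPluginV2DynamicExt
//     nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
//     nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
//                                             int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
//         TRT_NOEXCEPT override;
//     bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
//                                    int nbInputs, int nbOutputs) TRT_NOEXCEPT override;
//     int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
//                 const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
//                 void *const *outputs, void *workspace,
//                 cudaStream_t stream) TRT_NOEXCEPT override;
//     // IPluginV2Ext / IPluginV2
//     nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
//                                          int nbInputs) const TRT_NOEXCEPT override;
//     const char *getPluginType() const TRT_NOEXCEPT override;
//     int getNbOutputs() const TRT_NOEXCEPT override;
//     size_t getSerializationSize() const TRT_NOEXCEPT override;
//     void serialize(void *buffer) const TRT_NOEXCEPT override;
//   };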
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_PLUGIN_HELPER_HPP
#define TRT_PLUGIN_HELPER_HPP
#include <cudnn.h>
#include <iostream>
#include <stdexcept>
#include "NvInferRuntime.h"
cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t* cudnn_dtype);
// Enumerator for status
typedef enum {
STATUS_SUCCESS = 0,
STATUS_FAILURE = 1,
STATUS_BAD_PARAM = 2,
STATUS_NOT_SUPPORTED = 3,
STATUS_NOT_INITIALIZED = 4
} pluginStatus_t;
#define ASSERT(assertion) \
{ \
if (!(assertion)) { \
std::cerr << "Assertion failed: " << #assertion << ", " << __FILE__ << "," << __LINE__ \
<< std::endl; \
abort(); \
} \
}
#define CUASSERT(status_) \
{ \
auto s_ = status_; \
if (s_ != cudaSuccess) { \
std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << ", " << cudaGetErrorString(s_) \
<< std::endl; \
} \
}
#define CUBLASASSERT(status_) \
{ \
auto s_ = status_; \
if (s_ != CUBLAS_STATUS_SUCCESS) { \
std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \
} \
}
#define CUERRORMSG(status_) \
{ \
auto s_ = status_; \
if (s_ != 0) std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \
}
#ifndef DEBUG
#define CHECK(status) \
do { \
if (status != 0) abort(); \
} while (0)
#define ASSERT_PARAM(exp) \
do { \
if (!(exp)) return STATUS_BAD_PARAM; \
} while (0)
#define ASSERT_FAILURE(exp) \
do { \
if (!(exp)) return STATUS_FAILURE; \
} while (0)
#define CSC(call, err) \
do { \
cudaError_t cudaStatus = call; \
if (cudaStatus != cudaSuccess) { \
return err; \
} \
} while (0)
#define DEBUG_PRINTF(...) \
do { \
} while (0)
#else
#define ASSERT_PARAM(exp) \
do { \
if (!(exp)) { \
fprintf(stderr, "Bad param - " #exp ", %s:%d\n", __FILE__, __LINE__); \
return STATUS_BAD_PARAM; \
} \
} while (0)
#define ASSERT_FAILURE(exp) \
do { \
if (!(exp)) { \
fprintf(stderr, "Failure - " #exp ", %s:%d\n", __FILE__, __LINE__); \
return STATUS_FAILURE; \
} \
} while (0)
#define CSC(call, err) \
do { \
cudaError_t cudaStatus = call; \
if (cudaStatus != cudaSuccess) { \
printf("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaStatus)); \
return err; \
} \
} while (0)
#define CHECK(status) \
{ \
if (status != 0) { \
DEBUG_PRINTF("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \
abort(); \
} \
}
#define DEBUG_PRINTF(...) \
do { \
printf(__VA_ARGS__); \
} while (0)
#endif
namespace mmdeploy {
const int MAXTENSORDIMS = 10;
struct TensorDesc {
int shape[MAXTENSORDIMS];
int stride[MAXTENSORDIMS];
int dim;
};
inline unsigned int getElementSize(nvinfer1::DataType t) {
switch (t) {
case nvinfer1::DataType::kINT32:
return 4;
case nvinfer1::DataType::kFLOAT:
return 4;
case nvinfer1::DataType::kHALF:
return 2;
// case nvinfer1::DataType::kBOOL:
case nvinfer1::DataType::kINT8:
return 1;
default:
throw std::runtime_error("Invalid DataType.");
}
throw std::runtime_error("Invalid DataType.");
return 0;
}
inline size_t getAlignedSize(size_t origin_size, size_t aligned_number = 16) {
return size_t((origin_size + aligned_number - 1) / aligned_number) * aligned_number;
}
} // namespace mmdeploy
#endif // TRT_PLUGIN_HELPER_HPP
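// Editor's note: illustrative sketch (not part of this commit) of how getElementSize and
// getAlignedSize are typically combined when sizing a plugin workspace; the two sub-buffers are
// hypothetical.
//
//   size_t scoresBytes = mmdeploy::getAlignedSize(numScores * mmdeploy::getElementSize(dtype));
//   size_t indicesBytes = mmdeploy::getAlignedSize(numScores * sizeof(int));
//   size_t totalWorkspace = scoresBytes + indicesBytes;  // each sub-buffer stays 16-byte aligned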
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/common/serialize.hpp
#ifndef TRT_SERIALIZE_HPP
#define TRT_SERIALIZE_HPP
#include <cassert>
#include <cstring>
#include <type_traits>
#include <vector>
template <typename T>
inline void serialize_value(void** buffer, T const& value);
template <typename T>
inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value);
namespace {
template <typename T, class Enable = void>
struct Serializer {};
template <typename T>
struct Serializer<T,
typename std::enable_if<std::is_arithmetic<T>::value || std::is_enum<T>::value ||
std::is_pod<T>::value>::type> {
static size_t serialized_size(T const& value) { return sizeof(T); }
static void serialize(void** buffer, T const& value) {
::memcpy(*buffer, &value, sizeof(T));
reinterpret_cast<char*&>(*buffer) += sizeof(T);
}
static void deserialize(void const** buffer, size_t* buffer_size, T* value) {
assert(*buffer_size >= sizeof(T));
::memcpy(value, *buffer, sizeof(T));
reinterpret_cast<char const*&>(*buffer) += sizeof(T);
*buffer_size -= sizeof(T);
}
};
template <>
struct Serializer<const char*> {
static size_t serialized_size(const char* value) { return strlen(value) + 1; }
static void serialize(void** buffer, const char* value) {
::strcpy(static_cast<char*>(*buffer), value);
reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
}
static void deserialize(void const** buffer, size_t* buffer_size, const char** value) {
*value = static_cast<char const*>(*buffer);
size_t data_size = strnlen(*value, *buffer_size) + 1;
assert(*buffer_size >= data_size);
reinterpret_cast<char const*&>(*buffer) += data_size;
*buffer_size -= data_size;
}
};
template <typename T>
struct Serializer<std::vector<T>,
typename std::enable_if<std::is_arithmetic<T>::value || std::is_enum<T>::value ||
std::is_pod<T>::value>::type> {
static size_t serialized_size(std::vector<T> const& value) {
return sizeof(value.size()) + value.size() * sizeof(T);
}
static void serialize(void** buffer, std::vector<T> const& value) {
serialize_value(buffer, value.size());
size_t nbyte = value.size() * sizeof(T);
::memcpy(*buffer, value.data(), nbyte);
reinterpret_cast<char*&>(*buffer) += nbyte;
}
static void deserialize(void const** buffer, size_t* buffer_size, std::vector<T>* value) {
size_t size;
deserialize_value(buffer, buffer_size, &size);
value->resize(size);
size_t nbyte = value->size() * sizeof(T);
assert(*buffer_size >= nbyte);
::memcpy(value->data(), *buffer, nbyte);
reinterpret_cast<char const*&>(*buffer) += nbyte;
*buffer_size -= nbyte;
}
};
} // namespace
template <typename T>
inline size_t serialized_size(T const& value) {
return Serializer<T>::serialized_size(value);
}
template <typename T>
inline void serialize_value(void** buffer, T const& value) {
return Serializer<T>::serialize(buffer, value);
}
template <typename T>
inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value) {
return Serializer<T>::deserialize(buffer, buffer_size, value);
}
#endif // TRT_SERIALIZE_HPP
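// Editor's note: illustrative round-trip sketch (not part of this commit) for the helpers above,
// following the usual TensorRT getSerializationSize()/serialize()/deserialize() pattern; the
// Params struct and its fields are hypothetical.
//
//   struct Params { int topK; float iouThreshold; } p;
//
//   // getSerializationSize()
//   size_t nbytes = serialized_size(p.topK) + serialized_size(p.iouThreshold);
//   // serialize(void* buffer): each call writes the value and advances the cursor
//   void* d = buffer;
//   serialize_value(&d, p.topK);
//   serialize_value(&d, p.iouThreshold);
//   // deserializePlugin(const void* data, size_t length): read back in the same order,
//   // advancing `data` and shrinking `length`
//   deserialize_value(&data, &length, &p.topK);
//   deserialize_value(&data, &length, &p.iouThreshold);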
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>
#include "nms/kernel.h"
const static int BS = 512;
template <typename T_BBOX>
__device__ T_BBOX bboxSize(const Bbox<T_BBOX> &bbox, const bool normalized, T_BBOX offset) {
if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin) {
// If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
return 0;
} else {
T_BBOX width = bbox.xmax - bbox.xmin;
T_BBOX height = bbox.ymax - bbox.ymin;
if (normalized) {
return width * height;
} else {
// If bbox is not within range [0, 1].
return (width + offset) * (height + offset);
}
}
}
template <typename T_BBOX>
__device__ void intersectBbox(const Bbox<T_BBOX> &bbox1, const Bbox<T_BBOX> &bbox2,
Bbox<T_BBOX> *intersect_bbox) {
if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin || bbox2.ymin > bbox1.ymax ||
bbox2.ymax < bbox1.ymin) {
// Return [0, 0, 0, 0] if there is no intersection.
intersect_bbox->xmin = T_BBOX(0);
intersect_bbox->ymin = T_BBOX(0);
intersect_bbox->xmax = T_BBOX(0);
intersect_bbox->ymax = T_BBOX(0);
} else {
intersect_bbox->xmin = max(bbox1.xmin, bbox2.xmin);
intersect_bbox->ymin = max(bbox1.ymin, bbox2.ymin);
intersect_bbox->xmax = min(bbox1.xmax, bbox2.xmax);
intersect_bbox->ymax = min(bbox1.ymax, bbox2.ymax);
}
}
template <typename T_BBOX>
__device__ float jaccardOverlap(const Bbox<T_BBOX> &bbox1, const Bbox<T_BBOX> &bbox2,
const bool normalized, T_BBOX offset) {
Bbox<T_BBOX> intersect_bbox;
intersectBbox(bbox1, bbox2, &intersect_bbox);
float intersect_width, intersect_height;
if (normalized) {
intersect_width = intersect_bbox.xmax - intersect_bbox.xmin;
intersect_height = intersect_bbox.ymax - intersect_bbox.ymin;
} else {
intersect_width = intersect_bbox.xmax - intersect_bbox.xmin + offset;
intersect_height = intersect_bbox.ymax - intersect_bbox.ymin + offset;
}
if (intersect_width > 0 && intersect_height > 0) {
float intersect_size = intersect_width * intersect_height;
float bbox1_size = bboxSize(bbox1, normalized, offset);
float bbox2_size = bboxSize(bbox2, normalized, offset);
return intersect_size / (bbox1_size + bbox2_size - intersect_size);
} else {
return 0.;
}
}
/********** new NMS for only score and index array **********/
// clang-format off
template <typename T_SCORE, typename T_BBOX, int TSIZE>
__global__ void
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ == 620 || __CUDA_ARCH__ == 530
__launch_bounds__(512)
#endif
#endif
allClassNMS_kernel(const int num, const int num_classes, const int num_preds_per_class,
const int top_k, const float nms_threshold, const bool share_location,
const bool isNormalized,
T_BBOX *bbox_data, // bbox_data should be float to preserve
// location information
T_SCORE *beforeNMS_scores, int *beforeNMS_index_array,
T_SCORE *afterNMS_scores, int *afterNMS_index_array, bool flipXY = false) {
// clang-format on
//__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE];
__shared__ bool kept_bboxinfo_flag[TSIZE * BS];
for (int i = 0; i < num; i++) {
const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class;
const int max_idx = offset + top_k; // put top_k bboxes into NMS calculation
const int bbox_idx_offset =
share_location ? (i * num_preds_per_class) : (i * num_classes * num_preds_per_class);
// local thread data
int loc_bboxIndex[TSIZE];
Bbox<T_BBOX> loc_bbox[TSIZE];
// initialize Bbox, Bboxinfo, kept_bboxinfo_flag
// Eliminate shared memory RAW hazard
__syncthreads();
#pragma unroll
for (int t = 0; t < TSIZE; t++) {
const int cur_idx = threadIdx.x + blockDim.x * t;
const int item_idx = offset + cur_idx;
if (item_idx < max_idx) {
loc_bboxIndex[t] = beforeNMS_index_array[item_idx];
if (loc_bboxIndex[t] >= 0)
// if (loc_bboxIndex[t] != -1)
{
const int bbox_data_idx = share_location
? (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset)
: loc_bboxIndex[t];
loc_bbox[t].xmin =
flipXY ? bbox_data[bbox_data_idx * 4 + 1] : bbox_data[bbox_data_idx * 4 + 0];
loc_bbox[t].ymin =
flipXY ? bbox_data[bbox_data_idx * 4 + 0] : bbox_data[bbox_data_idx * 4 + 1];
loc_bbox[t].xmax =
flipXY ? bbox_data[bbox_data_idx * 4 + 3] : bbox_data[bbox_data_idx * 4 + 2];
loc_bbox[t].ymax =
flipXY ? bbox_data[bbox_data_idx * 4 + 2] : bbox_data[bbox_data_idx * 4 + 3];
kept_bboxinfo_flag[cur_idx] = true;
} else {
kept_bboxinfo_flag[cur_idx] = false;
}
} else {
kept_bboxinfo_flag[cur_idx] = false;
}
}
// filter out overlapped boxes with lower scores
int ref_item_idx = offset;
int ref_bbox_idx =
share_location
? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
: beforeNMS_index_array[ref_item_idx];
while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) {
Bbox<T_BBOX> ref_bbox;
ref_bbox.xmin = flipXY ? bbox_data[ref_bbox_idx * 4 + 1] : bbox_data[ref_bbox_idx * 4 + 0];
ref_bbox.ymin = flipXY ? bbox_data[ref_bbox_idx * 4 + 0] : bbox_data[ref_bbox_idx * 4 + 1];
ref_bbox.xmax = flipXY ? bbox_data[ref_bbox_idx * 4 + 3] : bbox_data[ref_bbox_idx * 4 + 2];
ref_bbox.ymax = flipXY ? bbox_data[ref_bbox_idx * 4 + 2] : bbox_data[ref_bbox_idx * 4 + 3];
// Eliminate shared memory RAW hazard
__syncthreads();
for (int t = 0; t < TSIZE; t++) {
const int cur_idx = threadIdx.x + blockDim.x * t;
const int item_idx = offset + cur_idx;
if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) {
// TODO: may need to add bool normalized as argument, HERE true means
// normalized
if (jaccardOverlap(ref_bbox, loc_bbox[t], isNormalized, T_BBOX(0)) > nms_threshold) {
kept_bboxinfo_flag[cur_idx] = false;
}
}
}
__syncthreads();
do {
ref_item_idx++;
} while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]);
ref_bbox_idx =
share_location
? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
: beforeNMS_index_array[ref_item_idx];
}
// store data
for (int t = 0; t < TSIZE; t++) {
const int cur_idx = threadIdx.x + blockDim.x * t;
const int read_item_idx = offset + cur_idx;
const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx;
/*
* If not keeping the bbox
* Set the score to 0
* Set the bounding box index to -1
*/
if (read_item_idx < max_idx) {
afterNMS_scores[write_item_idx] =
kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f;
afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1;
}
}
}
}
template <typename T_SCORE, typename T_BBOX>
pluginStatus_t allClassNMS_gpu(cudaStream_t stream, const int num, const int num_classes,
const int num_preds_per_class, const int top_k,
const float nms_threshold, const bool share_location,
const bool isNormalized, void *bbox_data, void *beforeNMS_scores,
void *beforeNMS_index_array, void *afterNMS_scores,
void *afterNMS_index_array, bool flipXY = false) {
#define P(tsize) allClassNMS_kernel<T_SCORE, T_BBOX, (tsize)>
void (*kernel[10])(const int, const int, const int, const int, const float, const bool,
const bool, T_BBOX *, T_SCORE *, int *, T_SCORE *, int *, bool) = {
P(1), P(2), P(3), P(4), P(5), P(6), P(7), P(8), P(9), P(10),
};
const int GS = num_classes;
const int t_size = (top_k + BS - 1) / BS;
ASSERT(t_size <= 10);
kernel[t_size - 1]<<<GS, BS, 0, stream>>>(
num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized,
(T_BBOX *)bbox_data, (T_SCORE *)beforeNMS_scores, (int *)beforeNMS_index_array,
(T_SCORE *)afterNMS_scores, (int *)afterNMS_index_array, flipXY);
cudaError_t code = cudaGetLastError();
CUASSERT(code);
CSC(code, STATUS_FAILURE);
return STATUS_SUCCESS;
}
// allClassNMS LAUNCH CONFIG
typedef pluginStatus_t (*nmsFunc)(cudaStream_t, const int, const int, const int, const int,
const float, const bool, const bool, void *, void *, void *,
void *, void *, bool);
struct nmsLaunchConfigSSD {
DataType t_score;
DataType t_bbox;
nmsFunc function;
nmsLaunchConfigSSD(DataType t_score, DataType t_bbox) : t_score(t_score), t_bbox(t_bbox) {}
nmsLaunchConfigSSD(DataType t_score, DataType t_bbox, nmsFunc function)
: t_score(t_score), t_bbox(t_bbox), function(function) {}
bool operator==(const nmsLaunchConfigSSD &other) {
return t_score == other.t_score && t_bbox == other.t_bbox;
}
};
static std::vector<nmsLaunchConfigSSD> nmsFuncVec;
bool nmsInit() {
nmsFuncVec.push_back(
nmsLaunchConfigSSD(DataType::kFLOAT, DataType::kFLOAT, allClassNMS_gpu<float, float>));
return true;
}
static bool initialized = nmsInit();
pluginStatus_t allClassNMS(cudaStream_t stream, const int num, const int num_classes,
const int num_preds_per_class, const int top_k,
const float nms_threshold, const bool share_location,
const bool isNormalized, const DataType DT_SCORE, const DataType DT_BBOX,
void *bbox_data, void *beforeNMS_scores, void *beforeNMS_index_array,
void *afterNMS_scores, void *afterNMS_index_array, bool flipXY) {
nmsLaunchConfigSSD lc(DT_SCORE, DT_BBOX);
for (unsigned i = 0; i < nmsFuncVec.size(); ++i) {
if (lc == nmsFuncVec[i]) {
DEBUG_PRINTF("all class nms kernel %d\n", i);
return nmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k,
nms_threshold, share_location, isNormalized, bbox_data,
beforeNMS_scores, beforeNMS_index_array, afterNMS_scores,
afterNMS_index_array, flipXY);
}
}
return STATUS_BAD_PARAM;
}
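// Editor's note: worked example (editor's addition) of the launch configuration above. With
// BS = 512 threads per block and, say, top_k = 1000, t_size = (1000 + 511) / 512 = 2, so
// allClassNMS_kernel<float, float, 2> is launched with GS = num_classes blocks; each thread then
// covers TSIZE = 2 of the top_k candidates and the block keeps TSIZE * BS = 1024 flags in
// kept_bboxinfo_flag. A top_k above 512 * 10 = 5120 would fail the ASSERT(t_size <= 10).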
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
#include <cmath>
#include <vector>
#include "nms/kernel.h"
template <typename T>
struct RotatedBox {
T x_ctr, y_ctr, w, h, a;
};
template <typename T>
struct Point {
T x, y;
__host__ __device__ __forceinline__ Point(const T &px = 0, const T &py = 0) : x(px), y(py) {}
__host__ __device__ __forceinline__ Point operator+(const Point &p) const {
return Point(x + p.x, y + p.y);
}
__host__ __device__ __forceinline__ Point &operator+=(const Point &p) {
x += p.x;
y += p.y;
return *this;
}
__host__ __device__ __forceinline__ Point operator-(const Point &p) const {
return Point(x - p.x, y - p.y);
}
__host__ __device__ __forceinline__ Point operator*(const T coeff) const {
return Point(x * coeff, y * coeff);
}
};
template <typename T>
__host__ __device__ __forceinline__ T dot_2d(const Point<T> &A, const Point<T> &B) {
return A.x * B.x + A.y * B.y;
}
template <typename T>
__host__ __device__ __forceinline__ T cross_2d(const Point<T> &A, const Point<T> &B) {
return A.x * B.y - B.x * A.y;
}
template <typename T>
__host__ __device__ __forceinline__ void get_rotated_vertices(const RotatedBox<T> &box,
Point<T> (&pts)[4]) {
// M_PI / 180. == 0.01745329251
// double theta = box.a * 0.01745329251;
// MODIFIED
double theta = box.a;
T cosTheta2 = (T)cos(theta) * 0.5f;
T sinTheta2 = (T)sin(theta) * 0.5f;
// y: top --> down; x: left --> right
pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
pts[2].x = 2 * box.x_ctr - pts[0].x;
pts[2].y = 2 * box.y_ctr - pts[0].y;
pts[3].x = 2 * box.x_ctr - pts[1].x;
pts[3].y = 2 * box.y_ctr - pts[1].y;
}
template <typename T>
__host__ __device__ __forceinline__ int get_intersection_points(const Point<T> (&pts1)[4],
const Point<T> (&pts2)[4],
Point<T> (&intersections)[24]) {
// Line vector
// A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
Point<T> vec1[4], vec2[4];
for (int i = 0; i < 4; i++) {
vec1[i] = pts1[(i + 1) % 4] - pts1[i];
vec2[i] = pts2[(i + 1) % 4] - pts2[i];
}
// Line test - test all line combos for intersection
int num = 0; // number of intersections
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
// Solve for 2x2 Ax=b
T det = cross_2d<T>(vec2[j], vec1[i]);
// This takes care of parallel lines
if (fabs(det) <= 1e-14) {
continue;
}
auto vec12 = pts2[j] - pts1[i];
T t1 = cross_2d<T>(vec2[j], vec12) / det;
T t2 = cross_2d<T>(vec1[i], vec12) / det;
if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
intersections[num++] = pts1[i] + vec1[i] * t1;
}
}
}
// Check for vertices of rect1 inside rect2
{
const auto &AB = vec2[0];
const auto &DA = vec2[3];
auto ABdotAB = dot_2d<T>(AB, AB);
auto ADdotAD = dot_2d<T>(DA, DA);
for (int i = 0; i < 4; i++) {
// assume ABCD is the rectangle, and P is the point to be judged
// P is inside ABCD iff. P's projection on AB lies within AB
// and P's projection on AD lies within AD
auto AP = pts1[i] - pts2[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
intersections[num++] = pts1[i];
}
}
}
// Reverse the check - check for vertices of rect2 inside rect1
{
const auto &AB = vec1[0];
const auto &DA = vec1[3];
auto ABdotAB = dot_2d<T>(AB, AB);
auto ADdotAD = dot_2d<T>(DA, DA);
for (int i = 0; i < 4; i++) {
auto AP = pts2[i] - pts1[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
intersections[num++] = pts2[i];
}
}
}
return num;
}
template <typename T>
__host__ __device__ __forceinline__ int convex_hull_graham(const Point<T> (&p)[24],
const int &num_in, Point<T> (&q)[24],
bool shift_to_zero = false) {
assert(num_in >= 2);
// Step 1:
// Find point with minimum y
// if more than 1 points have the same minimum y,
// pick the one with the minimum x.
int t = 0;
for (int i = 1; i < num_in; i++) {
if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
t = i;
}
}
auto &start = p[t]; // starting point
// Step 2:
// Subtract the starting point from every point (for sorting in the next step)
for (int i = 0; i < num_in; i++) {
q[i] = p[i] - start;
}
// Swap the starting point to position 0
auto tmp = q[0];
q[0] = q[t];
q[t] = tmp;
// Step 3:
// Sort point 1 ~ num_in according to their relative cross-product values
// (essentially sorting according to angles)
// If the angles are the same, sort according to their distance to origin
T dist[24];
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
for (int i = 1; i < num_in - 1; i++) {
for (int j = i + 1; j < num_in; j++) {
T crossProduct = cross_2d<T>(q[i], q[j]);
if ((crossProduct < -1e-6) || (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
auto q_tmp = q[i];
q[i] = q[j];
q[j] = q_tmp;
auto dist_tmp = dist[i];
dist[i] = dist[j];
dist[j] = dist_tmp;
}
}
}
// Step 4:
// Make sure there are at least 2 points (that don't overlap with each other)
// in the stack
int k; // index of the non-overlapped second point
for (k = 1; k < num_in; k++) {
if (dist[k] > 1e-8) {
break;
}
}
if (k == num_in) {
// We reach the end, which means the convex hull is just one point
q[0] = p[t];
return 1;
}
q[1] = q[k];
int m = 2; // 2 points in the stack
// Step 5:
// Finally we can start the scanning process.
// When a non-convex relationship between the 3 points is found
// (either concave shape or duplicated points),
// we pop the previous point from the stack
// until the 3-point relationship is convex again, or
// until the stack only contains two points
for (int i = k + 1; i < num_in; i++) {
while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
m--;
}
q[m++] = q[i];
}
// Step 6 (Optional):
// In general sense we need the original coordinates, so we
// need to shift the points back (reverting Step 2)
// But if we're only interested in getting the area/perimeter of the shape
// We can simply return.
if (!shift_to_zero) {
for (int i = 0; i < m; i++) {
q[i] += start;
}
}
return m;
}
template <typename T>
__host__ __device__ __forceinline__ T polygon_area(const Point<T> (&q)[24], const int &m) {
if (m <= 2) {
return 0;
}
T area = 0;
for (int i = 1; i < m - 1; i++) {
area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
template <typename T>
__host__ __device__ __forceinline__ T rotated_boxes_intersection(const RotatedBox<T> &box1,
const RotatedBox<T> &box2) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from get_intersection_points
Point<T> intersectPts[24], orderedPts[24];
Point<T> pts1[4];
Point<T> pts2[4];
get_rotated_vertices<T>(box1, pts1);
get_rotated_vertices<T>(box2, pts2);
int num = get_intersection_points<T>(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
return polygon_area<T>(orderedPts, num_convex);
}
template <typename T>
__host__ __device__ __forceinline__ T single_box_iou_rotated(T const *const box1_raw,
T const *const box2_raw) {
// shift center to the middle point to achieve higher precision in result
RotatedBox<T> box1, box2;
auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
box1.x_ctr = box1_raw[0] - center_shift_x;
box1.y_ctr = box1_raw[1] - center_shift_y;
box1.w = box1_raw[2];
box1.h = box1_raw[3];
box1.a = box1_raw[4];
box2.x_ctr = box2_raw[0] - center_shift_x;
box2.y_ctr = box2_raw[1] - center_shift_y;
box2.w = box2_raw[2];
box2.h = box2_raw[3];
box2.a = box2_raw[4];
const T area1 = box1.w * box1.h;
const T area2 = box2.w * box2.h;
if (area1 < 1e-14 || area2 < 1e-14) {
return 1.0f;
}
const T intersection = rotated_boxes_intersection<T>(box1, box2);
T baseS = 1.0;
baseS = (area1 + area2 - intersection);
const T iou = intersection / baseS;
return iou;
}
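// Editor's note: worked example (editor's addition). For box1_raw = {0, 0, 2, 2, 0} and
// box2_raw = {1, 1, 2, 2, 0} (x_ctr, y_ctr, w, h, angle), the two rectangles are [-1, 1] x [-1, 1]
// and [0, 2] x [0, 2]; their intersection is 1 and their union is 4 + 4 - 1 = 7, so
// single_box_iou_rotated returns 1 / 7 ~= 0.143. Note that the angle is interpreted in radians
// here, since the degree-to-radian conversion in get_rotated_vertices is commented out (MODIFIED).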
/********** new NMS for only score and index array **********/
template <typename T_SCORE, typename T_BBOX, int TSIZE>
__global__ void allClassRotatedNMS_kernel(const int num, const int num_classes,
const int num_preds_per_class, const int top_k,
const float nms_threshold, const bool share_location,
const bool isNormalized,
T_BBOX *bbox_data, // bbox_data should be float to
// preserve location information
T_SCORE *beforeNMS_scores, int *beforeNMS_index_array,
T_SCORE *afterNMS_scores, int *afterNMS_index_array) {
//__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE];
extern __shared__ bool kept_bboxinfo_flag[];
for (int i = 0; i < num; i++) {
const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class;
const int max_idx = offset + top_k; // put top_k bboxes into NMS calculation
const int bbox_idx_offset =
share_location ? (i * num_preds_per_class) : (i * num_classes * num_preds_per_class);
// local thread data
int loc_bboxIndex[TSIZE];
T_BBOX loc_bbox[TSIZE * 5];
// initialize Bbox, Bboxinfo, kept_bboxinfo_flag
// Eliminate shared memory RAW hazard
__syncthreads();
#pragma unroll
for (int t = 0; t < TSIZE; t++) {
const int cur_idx = threadIdx.x + blockDim.x * t;
const int item_idx = offset + cur_idx;
if (item_idx < max_idx) {
loc_bboxIndex[t] = beforeNMS_index_array[item_idx];
if (loc_bboxIndex[t] >= 0)
// if (loc_bboxIndex[t] != -1)
{
const int bbox_data_idx = share_location
? (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset)
: loc_bboxIndex[t];
memcpy(&loc_bbox[t * 5], &bbox_data[bbox_data_idx * 5], 5 * sizeof(T_BBOX));
kept_bboxinfo_flag[cur_idx] = true;
} else {
kept_bboxinfo_flag[cur_idx] = false;
}
} else {
kept_bboxinfo_flag[cur_idx] = false;
}
}
// filter out overlapped boxes with lower scores
int ref_item_idx = offset;
int ref_bbox_idx =
share_location
? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
: beforeNMS_index_array[ref_item_idx];
while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) {
T_BBOX ref_bbox[5];
memcpy(&ref_bbox[0], &bbox_data[ref_bbox_idx * 5], 5 * sizeof(T_BBOX));
// Eliminate shared memory RAW hazard
__syncthreads();
for (int t = 0; t < TSIZE; t++) {
const int cur_idx = threadIdx.x + blockDim.x * t;
const int item_idx = offset + cur_idx;
if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) {
// TODO: may need to add bool normalized as argument, HERE true means
// normalized
if (single_box_iou_rotated(&ref_bbox[0], loc_bbox + t * 5) > nms_threshold) {
kept_bboxinfo_flag[cur_idx] = false;
}
}
}
__syncthreads();
do {
ref_item_idx++;
} while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]);
ref_bbox_idx =
share_location
? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
: beforeNMS_index_array[ref_item_idx];
}
// store data
for (int t = 0; t < TSIZE; t++) {
const int cur_idx = threadIdx.x + blockDim.x * t;
const int read_item_idx = offset + cur_idx;
const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx;
/*
* If not keeping the bbox
* Set the score to 0
* Set the bounding box index to -1
*/
if (read_item_idx < max_idx) {
afterNMS_scores[write_item_idx] =
kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f;
afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1;
}
}
}
}
template <typename T_SCORE, typename T_BBOX>
pluginStatus_t allClassRotatedNMS_gpu(cudaStream_t stream, const int num, const int num_classes,
const int num_preds_per_class, const int top_k,
const float nms_threshold, const bool share_location,
const bool isNormalized, void *bbox_data,
void *beforeNMS_scores, void *beforeNMS_index_array,
void *afterNMS_scores, void *afterNMS_index_array) {
#define P(tsize) allClassRotatedNMS_kernel<T_SCORE, T_BBOX, (tsize)>
void (*kernel[10])(const int, const int, const int, const int, const float, const bool,
const bool, T_BBOX *, T_SCORE *, int *, T_SCORE *, int *) = {
P(1), P(2), P(3), P(4), P(5), P(6), P(7), P(8), P(9), P(10),
};
const int BS = 512;
const int GS = num_classes;
const int t_size = (top_k + BS - 1) / BS;
ASSERT(t_size <= 10);
kernel[t_size - 1]<<<GS, BS, BS * t_size * sizeof(bool), stream>>>(
num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized,
(T_BBOX *)bbox_data, (T_SCORE *)beforeNMS_scores, (int *)beforeNMS_index_array,
(T_SCORE *)afterNMS_scores, (int *)afterNMS_index_array);
CSC(cudaGetLastError(), STATUS_FAILURE);
return STATUS_SUCCESS;
}
// allClassNMS LAUNCH CONFIG
typedef pluginStatus_t (*rotatedNmsFunc)(cudaStream_t, const int, const int, const int, const int,
const float, const bool, const bool, void *, void *,
void *, void *, void *);
struct rotatedNmsLaunchConfig {
DataType t_score;
DataType t_bbox;
rotatedNmsFunc function;
rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox) : t_score(t_score), t_bbox(t_bbox) {}
rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox, rotatedNmsFunc function)
: t_score(t_score), t_bbox(t_bbox), function(function) {}
bool operator==(const rotatedNmsLaunchConfig &other) {
return t_score == other.t_score && t_bbox == other.t_bbox;
}
};
static std::vector<rotatedNmsLaunchConfig> rotatedNmsFuncVec;
bool rotatedNmsInit() {
rotatedNmsFuncVec.push_back(rotatedNmsLaunchConfig(DataType::kFLOAT, DataType::kFLOAT,
allClassRotatedNMS_gpu<float, float>));
return true;
}
static bool initialized = rotatedNmsInit();
pluginStatus_t allClassRotatedNMS(cudaStream_t stream, const int num, const int num_classes,
const int num_preds_per_class, const int top_k,
const float nms_threshold, const bool share_location,
const bool isNormalized, const DataType DT_SCORE,
const DataType DT_BBOX, void *bbox_data, void *beforeNMS_scores,
void *beforeNMS_index_array, void *afterNMS_scores,
void *afterNMS_index_array, bool) {
auto __cuda_arch__ = get_cuda_arch(0);  // assume device 0 is the device in use
if (__cuda_arch__ == 720 && top_k >= 1000) {
printf("Warning: pre_top_k needs to be reduced for devices with arch 7.2, got pre_top_k=%d\n",
top_k);
}
rotatedNmsLaunchConfig lc(DT_SCORE, DT_BBOX);
for (unsigned i = 0; i < rotatedNmsFuncVec.size(); ++i) {
if (lc == rotatedNmsFuncVec[i]) {
DEBUG_PRINTF("all class rotated nms kernel %d\n", i);
return rotatedNmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k,
nms_threshold, share_location, isNormalized, bbox_data,
beforeNMS_scores, beforeNMS_index_array, afterNMS_scores,
afterNMS_index_array);
}
}
return STATUS_BAD_PARAM;
}
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include "nms/batched_nms_kernel.hpp"
pluginStatus_t nmsInference(cudaStream_t stream, const int N, const int perBatchBoxesSize,
const int perBatchScoresSize, const bool shareLocation,
const int backgroundLabelId, const int numPredsPerClass,
const int numClasses, const int topK, const int keepTopK,
const float scoreThreshold, const float iouThreshold,
const DataType DT_BBOX, const void* locData, const DataType DT_SCORE,
const void* confData, void* nmsedDets, void* nmsedLabels,
void* nmsedIndex, void* workspace, bool isNormalized, bool confSigmoid,
bool clipBoxes, bool rotated) {
const int topKVal = topK < 0 ? numPredsPerClass : topK;
const int keepTopKVal = keepTopK < 0 ? numPredsPerClass : keepTopK;
// locCount = batch_size * number_boxes_per_sample * 4
const int locCount = N * perBatchBoxesSize;
/*
* shareLocation
* If true, bounding boxes are shared among all classes, i.e., a bounding box
* can be classified as any candidate class. Otherwise, bounding boxes are
* class-specific, i.e., each box is classified as belonging to one certain
* class or not (binary classification).
*/
const int numLocClasses = shareLocation ? 1 : numClasses;
size_t bboxDataSize = detectionForwardBBoxDataSize(N, perBatchBoxesSize, DataType::kFLOAT);
void* bboxDataRaw = workspace;
cudaMemcpyAsync(bboxDataRaw, locData, bboxDataSize, cudaMemcpyDeviceToDevice, stream);
pluginStatus_t status;
/*
* bboxDataRaw format:
* [batch size, numPriors (per sample), numLocClasses, 4]
*/
// float for now
void* bboxData;
size_t bboxPermuteSize =
detectionForwardBBoxPermuteSize(shareLocation, N, perBatchBoxesSize, DataType::kFLOAT);
void* bboxPermute = nextWorkspacePtr((int8_t*)bboxDataRaw, bboxDataSize);
/*
* After permutation, bboxData format:
* [batch_size, numLocClasses, numPriors (per sample) (numPredsPerClass), 4]
* This is equivalent to swapping axis
*/
if (!shareLocation) {
status = permuteData(stream, locCount, numLocClasses, numPredsPerClass, rotated ? 5 : 4,
DataType::kFLOAT, false, bboxDataRaw, bboxPermute);
ASSERT_FAILURE(status == STATUS_SUCCESS);
bboxData = bboxPermute;
}
/*
* If shareLocation, numLocClasses = 1
* No need to permute data on linear memory
*/
else {
bboxData = bboxDataRaw;
}
/*
* Conf data format
* [batch size, numPriors * param.numClasses, 1, 1]
*/
const int numScores = N * perBatchScoresSize;
size_t totalScoresSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
void* scores = nextWorkspacePtr((int8_t*)bboxPermute, bboxPermuteSize);
// need a conf_scores
/*
* After permutation, scores format:
* [batch_size, numClasses, numPredsPerClass, 1]
*/
status = permuteData(stream, numScores, numClasses, numPredsPerClass, 1, DataType::kFLOAT,
confSigmoid, confData, scores);
ASSERT_FAILURE(status == STATUS_SUCCESS);
size_t indicesSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
void* indices = nextWorkspacePtr((int8_t*)scores, totalScoresSize);
size_t postNMSScoresSize = detectionForwardPostNMSSize(N, numClasses, topKVal);
size_t postNMSIndicesSize = detectionForwardPostNMSSize(N, numClasses, topKVal);
void* postNMSScores = nextWorkspacePtr((int8_t*)indices, indicesSize);
void* postNMSIndices = nextWorkspacePtr((int8_t*)postNMSScores, postNMSScoresSize);
void* sortingWorkspace = nextWorkspacePtr((int8_t*)postNMSIndices, postNMSIndicesSize);
// Sort the scores so that the following NMS could be applied.
status = sortScoresPerClass(stream, N, numClasses, numPredsPerClass, backgroundLabelId,
scoreThreshold, DataType::kFLOAT, scores, indices, sortingWorkspace);
ASSERT_FAILURE(status == STATUS_SUCCESS);
// flipXY stays false here because the input bounding boxes are already in
// [xmin, ymin, xmax, ymax] order; it would be set to true if the boxes
// instead arrived as [ymin, xmin, ymax, xmax].
bool flipXY = false;
// NMS
if (rotated) {
status = allClassRotatedNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold,
shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT,
bboxData, scores, indices, postNMSScores, postNMSIndices, flipXY);
} else {
status = allClassNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold,
shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT, bboxData,
scores, indices, postNMSScores, postNMSIndices, flipXY);
}
ASSERT_FAILURE(status == STATUS_SUCCESS);
// Sort the bounding boxes after NMS using scores
status = sortScoresPerImage(stream, N, numClasses * topKVal, DataType::kFLOAT, postNMSScores,
postNMSIndices, scores, indices, sortingWorkspace);
ASSERT_FAILURE(status == STATUS_SUCCESS);
// Gather data from the sorted bounding boxes after NMS
status = gatherNMSOutputs(stream, shareLocation, N, numPredsPerClass, numClasses, topKVal,
keepTopKVal, DataType::kFLOAT, DataType::kFLOAT, indices, scores,
bboxData, nmsedDets, nmsedLabels, nmsedIndex, clipBoxes, rotated);
ASSERT_FAILURE(status == STATUS_SUCCESS);
return STATUS_SUCCESS;
}
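// Editor's note: illustrative call sketch (not part of this commit). The caller is expected to
// size the workspace with detectionInferenceWorkspaceSize() using the same parameters it then
// passes to nmsInference(); in the plugin the buffer comes from getWorkspaceSize()/enqueue, so the
// cudaMalloc below is only for the sketch and the variable names are hypothetical.
//
//   size_t wsSize = detectionInferenceWorkspaceSize(
//       shareLocation, N, perBatchBoxesSize, perBatchScoresSize, numClasses, numPredsPerClass,
//       topK, nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kFLOAT);
//   void* workspace = nullptr;
//   cudaMalloc(&workspace, wsSize);
//   pluginStatus_t st = nmsInference(
//       stream, N, perBatchBoxesSize, perBatchScoresSize, shareLocation, backgroundLabelId,
//       numPredsPerClass, numClasses, topK, keepTopK, scoreThreshold, iouThreshold,
//       nvinfer1::DataType::kFLOAT, locData, nvinfer1::DataType::kFLOAT, confData, nmsedDets,
//       nmsedLabels, nmsedIndex, workspace, isNormalized, confSigmoid, clipBoxes, rotated);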
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>
#include "nms/kernel.h"
#include "trt_plugin_helper.hpp"
template <typename T_BBOX, typename T_SCORE, bool rotated, unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
void gatherNMSOutputs_kernel(const bool shareLocation, const int numImages,
const int numPredsPerClass, const int numClasses, const int topK,
const int keepTopK, const int *indices, const T_SCORE *scores,
const T_BBOX *bboxData, T_BBOX *nmsedDets, int *nmsedLabels,
int *nmsedIndex, bool clipBoxes) {
if (keepTopK > topK) return;
for (int i = blockIdx.x * nthds_per_cta + threadIdx.x; i < numImages * keepTopK;
i += gridDim.x * nthds_per_cta) {
const int imgId = i / keepTopK;
const int detId = i % keepTopK;
const int offset = imgId * numClasses * topK;
const int index = indices[offset + detId];
const T_SCORE score = scores[offset + detId];
if (index == -1) {
nmsedLabels[i] = -1;
if (nmsedIndex != nullptr) {
nmsedIndex[i] = -1;
}
if (rotated) {
nmsedDets[i * 6] = 0;
nmsedDets[i * 6 + 1] = 0;
nmsedDets[i * 6 + 2] = 0;
nmsedDets[i * 6 + 3] = 0;
nmsedDets[i * 6 + 4] = 0;
nmsedDets[i * 6 + 5] = 0;
} else {
nmsedDets[i * 5] = 0;
nmsedDets[i * 5 + 1] = 0;
nmsedDets[i * 5 + 2] = 0;
nmsedDets[i * 5 + 3] = 0;
nmsedDets[i * 5 + 4] = 0;
}
} else {
const int bboxOffset =
imgId * (shareLocation ? numPredsPerClass : (numClasses * numPredsPerClass));
nmsedLabels[i] = (index % (numClasses * numPredsPerClass)) / numPredsPerClass; // label
if (rotated) {
const int bboxId = ((shareLocation ? (index % numPredsPerClass)
: index % (numClasses * numPredsPerClass)) +
bboxOffset) *
5;
if (nmsedIndex != nullptr) {
nmsedIndex[i] = bboxId / 5 - bboxOffset;
}
// clipped bbox xmin
nmsedDets[i * 6] =
clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId];
// clipped bbox ymin
nmsedDets[i * 6 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.))
: bboxData[bboxId + 1];
// clipped bbox xmax
nmsedDets[i * 6 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.))
: bboxData[bboxId + 2];
// clipped bbox ymax
nmsedDets[i * 6 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.))
: bboxData[bboxId + 3];
// clipped bbox angle
nmsedDets[i * 6 + 4] = clipBoxes ? max(min(bboxData[bboxId + 4], T_BBOX(1.)), T_BBOX(0.))
: bboxData[bboxId + 4];
nmsedDets[i * 6 + 5] = score;
} else {
const int bboxId = ((shareLocation ? (index % numPredsPerClass)
: index % (numClasses * numPredsPerClass)) +
bboxOffset) *
4;
if (nmsedIndex != nullptr) {
nmsedIndex[i] = bboxId / 4 - bboxOffset;
}
// clipped bbox xmin
nmsedDets[i * 5] =
clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId];
// clipped bbox ymin
nmsedDets[i * 5 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.))
: bboxData[bboxId + 1];
// clipped bbox xmax
nmsedDets[i * 5 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.))
: bboxData[bboxId + 2];
// clipped bbox ymax
nmsedDets[i * 5 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.))
: bboxData[bboxId + 3];
nmsedDets[i * 5 + 4] = score;
}
}
}
}
template <typename T_BBOX, typename T_SCORE, bool rotated>
pluginStatus_t gatherNMSOutputs_gpu(cudaStream_t stream, const bool shareLocation,
const int numImages, const int numPredsPerClass,
const int numClasses, const int topK, const int keepTopK,
const void *indices, const void *scores, const void *bboxData,
void *nmsedDets, void *nmsedLabels, void *nmsedIndex,
bool clipBoxes) {
const int BS = 32;
const int GS = 32;
gatherNMSOutputs_kernel<T_BBOX, T_SCORE, rotated, BS><<<GS, BS, 0, stream>>>(
shareLocation, numImages, numPredsPerClass, numClasses, topK, keepTopK, (int *)indices,
(T_SCORE *)scores, (T_BBOX *)bboxData, (T_BBOX *)nmsedDets, (int *)nmsedLabels,
(int *)nmsedIndex, clipBoxes);
CSC(cudaGetLastError(), STATUS_FAILURE);
return STATUS_SUCCESS;
}
// gatherNMSOutputs LAUNCH CONFIG {{{
typedef pluginStatus_t (*nmsOutFunc)(cudaStream_t, const bool, const int, const int, const int,
const int, const int, const void *, const void *, const void *,
void *, void *, void *, bool);
struct nmsOutLaunchConfig {
DataType t_bbox;
DataType t_score;
bool rotated;
nmsOutFunc function;
nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated)
: t_bbox(t_bbox), t_score(t_score), rotated(rotated) {}
nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated, nmsOutFunc function)
: t_bbox(t_bbox), t_score(t_score), rotated(rotated), function(function) {}
bool operator==(const nmsOutLaunchConfig &other) {
return t_bbox == other.t_bbox && t_score == other.t_score && rotated == other.rotated;
}
};
using nvinfer1::DataType;
static std::vector<nmsOutLaunchConfig> nmsOutFuncVec;
bool nmsOutputInit() {
nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, false,
gatherNMSOutputs_gpu<float, float, false>));
nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, true,
gatherNMSOutputs_gpu<float, float, true>));
return true;
}
static bool initialized = nmsOutputInit();
pluginStatus_t gatherNMSOutputs(cudaStream_t stream, const bool shareLocation, const int numImages,
const int numPredsPerClass, const int numClasses, const int topK,
const int keepTopK, const DataType DT_BBOX, const DataType DT_SCORE,
const void *indices, const void *scores, const void *bboxData,
void *nmsedDets, void *nmsedLabels, void *nmsedIndex,
bool clipBoxes, bool rotated) {
nmsOutLaunchConfig lc = nmsOutLaunchConfig(DT_BBOX, DT_SCORE, rotated);
for (unsigned i = 0; i < nmsOutFuncVec.size(); ++i) {
if (lc == nmsOutFuncVec[i]) {
DEBUG_PRINTF("gatherNMSOutputs kernel %d\n", i);
return nmsOutFuncVec[i].function(stream, shareLocation, numImages, numPredsPerClass,
numClasses, topK, keepTopK, indices, scores, bboxData,
nmsedDets, nmsedLabels, nmsedIndex, clipBoxes);
}
}
return STATUS_BAD_PARAM;
}
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <stdint.h>
#include <cub/cub.cuh>
#include "cublas_v2.h"
#include "nms/kernel.h"
#include "trt_plugin_helper.hpp"
#define CUDA_MEM_ALIGN 256
// return cuda arch
size_t get_cuda_arch(int devID) {
int computeMode = -1, major = 0, minor = 0;
CUASSERT(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
CUASSERT(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
CUASSERT(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
return major * 100 + minor * 10;
}
// ALIGNPTR
int8_t *alignPtr(int8_t *ptr, uintptr_t to) {
uintptr_t addr = (uintptr_t)ptr;
if (addr % to) {
addr += to - addr % to;
}
return (int8_t *)addr;
}
// NEXTWORKSPACEPTR
int8_t *nextWorkspacePtr(int8_t *ptr, uintptr_t previousWorkspaceSize) {
uintptr_t addr = (uintptr_t)ptr;
addr += previousWorkspaceSize;
return alignPtr((int8_t *)addr, CUDA_MEM_ALIGN);
}
// CALCULATE TOTAL WORKSPACE SIZE
size_t calculateTotalWorkspaceSize(size_t *workspaces, int count) {
size_t total = 0;
for (int i = 0; i < count; i++) {
total += workspaces[i];
if (workspaces[i] % CUDA_MEM_ALIGN) {
total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN);
}
}
return total;
}
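// Editor's note: worked example (editor's addition). With CUDA_MEM_ALIGN = 256, the sizes
// {1000, 300} contribute 1000 + 24 = 1024 and 300 + 212 = 512 bytes respectively, so
// calculateTotalWorkspaceSize returns 1536; the padding matches what nextWorkspacePtr skips when
// stepping from one sub-buffer to the next.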
using nvinfer1::DataType;
template <unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
void setUniformOffsets_kernel(const int num_segments, const int offset, int *d_offsets) {
const int idx = blockIdx.x * nthds_per_cta + threadIdx.x;
if (idx <= num_segments) d_offsets[idx] = idx * offset;
}
void setUniformOffsets(cudaStream_t stream, const int num_segments, const int offset,
int *d_offsets) {
const int BS = 32;
const int GS = (num_segments + 1 + BS - 1) / BS;
setUniformOffsets_kernel<BS><<<GS, BS, 0, stream>>>(num_segments, offset, d_offsets);
}
size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX) {
if (DT_BBOX == DataType::kFLOAT) {
return N * C1 * sizeof(float);
}
printf("Only FP32 type bounding boxes are supported.\n");
return (size_t)-1;
}
size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX) {
if (DT_BBOX == DataType::kFLOAT) {
return shareLocation ? 0 : N * C1 * sizeof(float);
}
printf("Only FP32 type bounding boxes are supported.\n");
return (size_t)-1;
}
size_t detectionForwardPreNMSSize(int N, int C2) {
ASSERT(sizeof(float) == sizeof(int));
return N * C2 * sizeof(float);
}
size_t detectionForwardPostNMSSize(int N, int numClasses, int topK) {
ASSERT(sizeof(float) == sizeof(int));
return N * numClasses * topK * sizeof(float);
}
size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses,
int numPredsPerClass, int topK, DataType DT_BBOX,
DataType DT_SCORE) {
size_t wss[7];
wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX);
wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX);
wss[2] = detectionForwardPreNMSSize(N, C2);
wss[3] = detectionForwardPreNMSSize(N, C2);
wss[4] = detectionForwardPostNMSSize(N, numClasses, topK);
wss[5] = detectionForwardPostNMSSize(N, numClasses, topK);
wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE),
sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE));
return calculateTotalWorkspaceSize(wss, 7);
}
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>
#include "nms/kernel.h"
template <typename Dtype, unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
void permuteData_kernel(const int nthreads, const int num_classes, const int num_data,
const int num_dim, bool confSigmoid, const Dtype *data,
Dtype *new_data) {
// data format: [batch_size, num_data, num_classes, num_dim]
for (int index = blockIdx.x * nthds_per_cta + threadIdx.x; index < nthreads;
index += nthds_per_cta * gridDim.x) {
const int i = index % num_dim;
const int c = (index / num_dim) % num_classes;
const int d = (index / num_dim / num_classes) % num_data;
const int n = index / num_dim / num_classes / num_data;
const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i;
float result = data[index];
if (confSigmoid) result = exp(result) / (1 + exp(result));
new_data[new_index] = result;
}
// new data format: [batch_size, num_classes, num_data, num_dim]
}
template <typename Dtype>
pluginStatus_t permuteData_gpu(cudaStream_t stream, const int nthreads, const int num_classes,
const int num_data, const int num_dim, bool confSigmoid,
const void *data, void *new_data) {
const int BS = 512;
const int GS = (nthreads + BS - 1) / BS;
permuteData_kernel<Dtype, BS><<<GS, BS, 0, stream>>>(nthreads, num_classes, num_data, num_dim,
confSigmoid, (const Dtype *)data,
(Dtype *)new_data);
CSC(cudaGetLastError(), STATUS_FAILURE);
return STATUS_SUCCESS;
}
// permuteData LAUNCH CONFIG
typedef pluginStatus_t (*pdFunc)(cudaStream_t, const int, const int, const int, const int, bool,
const void *, void *);
struct pdLaunchConfig {
DataType t_data;
pdFunc function;
pdLaunchConfig(DataType t_data) : t_data(t_data) {}
pdLaunchConfig(DataType t_data, pdFunc function) : t_data(t_data), function(function) {}
bool operator==(const pdLaunchConfig &other) { return t_data == other.t_data; }
};
static std::vector<pdLaunchConfig> pdFuncVec;
bool permuteDataInit() {
pdFuncVec.push_back(pdLaunchConfig(DataType::kFLOAT, permuteData_gpu<float>));
return true;
}
static bool initialized = permuteDataInit();
pluginStatus_t permuteData(cudaStream_t stream, const int nthreads, const int num_classes,
const int num_data, const int num_dim, const DataType DT_DATA,
bool confSigmoid, const void *data, void *new_data) {
pdLaunchConfig lc = pdLaunchConfig(DT_DATA);
for (unsigned i = 0; i < pdFuncVec.size(); ++i) {
if (lc == pdFuncVec[i]) {
DEBUG_PRINTF("permuteData kernel %d\n", i);
return pdFuncVec[i].function(stream, nthreads, num_classes, num_data, num_dim, confSigmoid,
data, new_data);
}
}
return STATUS_BAD_PARAM;
}
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>
#include "cub/cub.cuh"
#include "nms/cub_helper.h"
#include "nms/kernel.h"
#include "trt_plugin_helper.hpp"
template <typename T_SCORE, unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
void prepareSortData(const int num, const int num_classes, const int num_preds_per_class,
const int background_label_id, const float confidence_threshold,
T_SCORE *conf_scores_gpu, T_SCORE *temp_scores, int *temp_idx,
int *d_offsets) {
// Prepare scores data for sort
const int cur_idx = blockIdx.x * nthds_per_cta + threadIdx.x;
const int numPredsPerBatch = num_classes * num_preds_per_class;
if (cur_idx < numPredsPerBatch) {
const int class_idx = cur_idx / num_preds_per_class;
for (int i = 0; i < num; i++) {
const int targetIdx = i * numPredsPerBatch + cur_idx;
const T_SCORE score = conf_scores_gpu[targetIdx];
// "Clear" background labeled score and index
// Because we do not care about background
if (class_idx == background_label_id) {
// Set scores to 0
// Set label = -1
temp_scores[targetIdx] = 0.0f;
temp_idx[targetIdx] = -1;
conf_scores_gpu[targetIdx] = 0.0f;
}
// "Clear" scores lower than threshold
else {
if (score > confidence_threshold) {
temp_scores[targetIdx] = score;
temp_idx[targetIdx] = cur_idx + i * numPredsPerBatch;
} else {
// Set scores to 0
// Set label = -1
temp_scores[targetIdx] = 0.0f;
temp_idx[targetIdx] = -1;
conf_scores_gpu[targetIdx] = 0.0f;
// TODO: HERE writing memory too many times
}
}
if ((cur_idx % num_preds_per_class) == 0) {
const int offset_ct = i * num_classes + cur_idx / num_preds_per_class;
d_offsets[offset_ct] = offset_ct * num_preds_per_class;
// set the last element in d_offset
if (blockIdx.x == 0 && threadIdx.x == 0)
d_offsets[num * num_classes] = num * numPredsPerBatch;
}
}
}
}
template <typename T_SCORE>
pluginStatus_t sortScoresPerClass_gpu(cudaStream_t stream, const int num, const int num_classes,
const int num_preds_per_class, const int background_label_id,
const float confidence_threshold, void *conf_scores_gpu,
void *index_array_gpu, void *workspace) {
const int num_segments = num * num_classes;
void *temp_scores = workspace;
const int arrayLen = num * num_classes * num_preds_per_class;
void *temp_idx = nextWorkspacePtr((int8_t *)temp_scores, arrayLen * sizeof(T_SCORE));
void *d_offsets = nextWorkspacePtr((int8_t *)temp_idx, arrayLen * sizeof(int));
size_t cubOffsetSize = (num_segments + 1) * sizeof(int);
void *cubWorkspace = nextWorkspacePtr((int8_t *)d_offsets, cubOffsetSize);
const int BS = 512;
const int GS = (num_classes * num_preds_per_class + BS - 1) / BS;
prepareSortData<T_SCORE, BS><<<GS, BS, 0, stream>>>(
num, num_classes, num_preds_per_class, background_label_id, confidence_threshold,
(T_SCORE *)conf_scores_gpu, (T_SCORE *)temp_scores, (int *)temp_idx, (int *)d_offsets);
size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_segments);
cub::DeviceSegmentedRadixSort::SortPairsDescending(
cubWorkspace, temp_storage_bytes, (const T_SCORE *)(temp_scores),
(T_SCORE *)(conf_scores_gpu), (const int *)(temp_idx), (int *)(index_array_gpu), arrayLen,
num_segments, (const int *)d_offsets, (const int *)d_offsets + 1, 0, sizeof(T_SCORE) * 8,
stream);
CSC(cudaGetLastError(), STATUS_FAILURE);
return STATUS_SUCCESS;
}
// sortScoresPerClass LAUNCH CONFIG
typedef pluginStatus_t (*sspcFunc)(cudaStream_t, const int, const int, const int, const int,
const float, void *, void *, void *);
struct sspcLaunchConfig {
DataType t_score;
sspcFunc function;
sspcLaunchConfig(DataType t_score) : t_score(t_score) {}
sspcLaunchConfig(DataType t_score, sspcFunc function) : t_score(t_score), function(function) {}
bool operator==(const sspcLaunchConfig &other) { return t_score == other.t_score; }
};
static std::vector<sspcLaunchConfig> sspcFuncVec;
bool sspcInit() {
sspcFuncVec.push_back(sspcLaunchConfig(DataType::kFLOAT, sortScoresPerClass_gpu<float>));
return true;
}
static bool initialized = sspcInit();
pluginStatus_t sortScoresPerClass(cudaStream_t stream, const int num, const int num_classes,
const int num_preds_per_class, const int background_label_id,
const float confidence_threshold, const DataType DT_SCORE,
void *conf_scores_gpu, void *index_array_gpu, void *workspace) {
sspcLaunchConfig lc = sspcLaunchConfig(DT_SCORE);
for (unsigned i = 0; i < sspcFuncVec.size(); ++i) {
if (lc == sspcFuncVec[i]) {
DEBUG_PRINTF("sortScoresPerClass kernel %d\n", i);
return sspcFuncVec[i].function(stream, num, num_classes, num_preds_per_class,
background_label_id, confidence_threshold, conf_scores_gpu,
index_array_gpu, workspace);
}
}
return STATUS_BAD_PARAM;
}
size_t sortScoresPerClassWorkspaceSize(const int num, const int num_classes,
const int num_preds_per_class, const DataType DT_CONF) {
size_t wss[4];
const int arrayLen = num * num_classes * num_preds_per_class;
wss[0] = arrayLen * mmdeploy::getElementSize(DT_CONF); // temp scores
wss[1] = arrayLen * sizeof(int); // temp indices
wss[2] = (num * num_classes + 1) * sizeof(int); // offsets
if (DT_CONF == DataType::kFLOAT) {
wss[3] = cubSortPairsWorkspaceSize<float, int>(arrayLen, num * num_classes); // cub workspace
} else {
printf("SCORE type not supported\n");
return (size_t)-1;
}
return calculateTotalWorkspaceSize(wss, 4);
}
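// Worked example (hypothetical sizes, float scores): with num = 2 images,
// num_classes = 80 and num_preds_per_class = 1000, arrayLen = 160000 and the
// four regions summed (and aligned) by calculateTotalWorkspaceSize are:
//
//   wss[0] = 160000 * sizeof(float)       // temp scores
//   wss[1] = 160000 * sizeof(int)         // temp indices
//   wss[2] = (2 * 80 + 1) * sizeof(int)   // segment offsets
//   wss[3] = cubSortPairsWorkspaceSize<float, int>(160000, 160)
//
// sortScoresPerClass_gpu above carves these regions out of the single workspace
// pointer with nextWorkspacePtr in the same order.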
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>
#include "cub/cub.cuh"
#include "nms/cub_helper.h"
#include "nms/kernel.h"
template <typename T_SCORE>
pluginStatus_t sortScoresPerImage_gpu(cudaStream_t stream, const int num_images,
const int num_items_per_image, void *unsorted_scores,
void *unsorted_bbox_indices, void *sorted_scores,
void *sorted_bbox_indices, void *workspace) {
void *d_offsets = workspace;
void *cubWorkspace = nextWorkspacePtr((int8_t *)d_offsets, (num_images + 1) * sizeof(int));
setUniformOffsets(stream, num_images, num_items_per_image, (int *)d_offsets);
const int arrayLen = num_images * num_items_per_image;
size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_images);
cub::DeviceSegmentedRadixSort::SortPairsDescending(
cubWorkspace, temp_storage_bytes, (const T_SCORE *)(unsorted_scores),
(T_SCORE *)(sorted_scores), (const int *)(unsorted_bbox_indices),
(int *)(sorted_bbox_indices), arrayLen, num_images, (const int *)d_offsets,
(const int *)d_offsets + 1, 0, sizeof(T_SCORE) * 8, stream);
CSC(cudaGetLastError(), STATUS_FAILURE);
return STATUS_SUCCESS;
}
// sortScoresPerImage LAUNCH CONFIG
typedef pluginStatus_t (*sspiFunc)(cudaStream_t, const int, const int, void *, void *, void *,
void *, void *);
struct sspiLaunchConfig {
DataType t_score;
sspiFunc function;
sspiLaunchConfig(DataType t_score) : t_score(t_score) {}
sspiLaunchConfig(DataType t_score, sspiFunc function) : t_score(t_score), function(function) {}
bool operator==(const sspiLaunchConfig &other) { return t_score == other.t_score; }
};
static std::vector<sspiLaunchConfig> sspiFuncVec;
bool sspiInit() {
sspiFuncVec.push_back(sspiLaunchConfig(DataType::kFLOAT, sortScoresPerImage_gpu<float>));
return true;
}
static bool initialized = sspiInit();
pluginStatus_t sortScoresPerImage(cudaStream_t stream, const int num_images,
const int num_items_per_image, const DataType DT_SCORE,
void *unsorted_scores, void *unsorted_bbox_indices,
void *sorted_scores, void *sorted_bbox_indices, void *workspace) {
sspiLaunchConfig lc = sspiLaunchConfig(DT_SCORE);
for (unsigned i = 0; i < sspiFuncVec.size(); ++i) {
if (lc == sspiFuncVec[i]) {
DEBUG_PRINTF("sortScoresPerImage kernel %d\n", i);
return sspiFuncVec[i].function(stream, num_images, num_items_per_image, unsorted_scores,
unsorted_bbox_indices, sorted_scores, sorted_bbox_indices,
workspace);
}
}
return STATUS_BAD_PARAM;
}
size_t sortScoresPerImageWorkspaceSize(const int num_images, const int num_items_per_image,
const DataType DT_SCORE) {
const int arrayLen = num_images * num_items_per_image;
size_t wss[2];
wss[0] = (num_images + 1) * sizeof(int); // offsets
if (DT_SCORE == DataType::kFLOAT) {
wss[1] = cubSortPairsWorkspaceSize<float, int>(arrayLen,
num_images); // cub workspace
} else {
printf("SCORE type not supported.\n");
return (size_t)-1;
}
return calculateTotalWorkspaceSize(wss, 2);
}
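// Usage sketch (illustrative; buffer names are hypothetical): size the workspace
// with this function before launching the per-image sort, e.g.
//
//   size_t ws = sortScoresPerImageWorkspaceSize(num_images, num_items_per_image,
//                                               DataType::kFLOAT);
//   // allocate ws bytes on the device as d_workspace, then:
//   sortScoresPerImage(stream, num_images, num_items_per_image, DataType::kFLOAT,
//                      d_unsorted_scores, d_unsorted_idx,
//                      d_sorted_scores, d_sorted_idx, d_workspace);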
// Copyright (c) OpenMMLab. All rights reserved.
#include "common_cuda_helper.hpp"
#include "trt_plugin_helper.hpp"
using mmdeploy::TensorDesc;
template <class scalar_t>
__global__ void copy_permute_kernel(scalar_t *__restrict__ dst, const scalar_t *__restrict__ src,
int n, TensorDesc ts_src_stride, TensorDesc ts_dst_stride,
TensorDesc ts_permute) {
const int src_dim = ts_src_stride.dim;
const auto src_stride = ts_src_stride.stride;
const auto dst_stride = ts_dst_stride.stride;
const auto permute = ts_permute.shape;
CUDA_1D_KERNEL_LOOP(index, n) {
size_t dst_index = index;
size_t src_index = 0;
for (int i = 0; i < src_dim; ++i) {
int dim_index = dst_index / dst_stride[i];
dst_index = dst_index % dst_stride[i];
src_index += dim_index * src_stride[permute[i]];
}
dst[index] = src[src_index];
}
}
template <class scalar_t>
void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size, int *permute, int src_dim,
cudaStream_t stream) {
size_t copy_size = 1;
TensorDesc ts_permute;
memcpy(&(ts_permute.shape[0]), permute, src_dim * sizeof(int));
TensorDesc ts_src_stride;
TensorDesc ts_dst_stride;
ts_src_stride.dim = src_dim;
ts_dst_stride.dim = src_dim;
int *src_stride = &(ts_src_stride.stride[0]);
int *dst_stride = &(ts_dst_stride.stride[0]);
int *dst_size = &(ts_dst_stride.shape[0]);
src_stride[src_dim - 1] = 1;
dst_stride[src_dim - 1] = 1;
for (int i = src_dim - 1; i >= 0; --i) {
dst_size[i] = src_size[permute[i]];
if (i < src_dim - 1) {
src_stride[i] = src_stride[i + 1] * src_size[i + 1];
}
}
for (int i = src_dim - 1; i >= 0; --i) {
copy_size *= dst_size[i];
if (i < src_dim - 1) {
dst_stride[i] = dst_stride[i + 1] * dst_size[i + 1];
}
}
copy_permute_kernel<scalar_t><<<GET_BLOCKS(copy_size), THREADS_PER_BLOCK, 0, stream>>>(
dst, src, copy_size, ts_src_stride, ts_dst_stride, ts_permute);
}
template void memcpyPermute<float>(float *dst, const float *src, int *src_size, int *permute,
int src_dim, cudaStream_t stream);
template void memcpyPermute<half>(half *dst, const half *src, int *src_size, int *permute,
int src_dim, cudaStream_t stream);
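// Usage sketch (hypothetical buffers and sizes): permute a [2, 3, 4, 5] float
// tensor with permute = {0, 2, 1, 3}, i.e. dst[n][h][c][w] = src[n][c][h][w],
// giving a destination of shape [2, 4, 3, 5]:
//
//   int src_size[4] = {2, 3, 4, 5};
//   int permute[4] = {0, 2, 1, 3};
//   memcpyPermute<float>(d_dst, d_src, src_size, permute, 4, stream);
//
// d_src and d_dst are assumed to be device buffers of 2 * 3 * 4 * 5 floats; the
// deformable convolution kernel in this patch applies the same helper with a
// 5-D permute of {0, 2, 1, 3, 4}.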
cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t *cudnn_dtype) {
switch (trt_dtype) {
case nvinfer1::DataType::kFLOAT:
*cudnn_dtype = CUDNN_DATA_FLOAT;
break;
case nvinfer1::DataType::kHALF:
*cudnn_dtype = CUDNN_DATA_HALF;
break;
default:
return CUDNN_STATUS_BAD_PARAM;
}
return CUDNN_STATUS_SUCCESS;
}
template <>
cublasStatus_t cublasGemmWrap<float>(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *A, int lda, const float *B,
int ldb, const float *beta, float *C, int ldc) {
return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
template <>
cublasStatus_t cublasGemmWrap<half>(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const half *alpha, const half *A, int lda, const half *B,
int ldb, const half *beta, half *C, int ldc) {
return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
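// Note (illustrative): cublasSgemm / cublasHgemm interpret their operands as
// column-major. Callers in this patch (see deform_conv) therefore pass the
// dimensions as (n, m, k) with the im2col column buffer as A and the weight as B,
// which yields the row-major product out[m][n] = weight[m][k] * columns[k][n]
// without an explicit transpose, e.g.
//
//   cublasGemmWrap<float>(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
//                         &alpha, d_columns, n, d_weight, k, &beta, d_out, n);
//
// d_columns, d_weight and d_out are hypothetical device buffers here.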
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_deform_conv.hpp"
#include <assert.h>
#include <chrono>
#include "trt_deform_conv_kernel.hpp"
#include "trt_serialize.hpp"
using namespace nvinfer1;
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVDeformConv2d"};
} // namespace
DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string &name,
const nvinfer1::Dims stride,
const nvinfer1::Dims padding,
const nvinfer1::Dims dilation,
const int deformableGroup, const int group)
: TRTPluginBase(name),
mStride(stride),
mPadding(padding),
mDilation(dilation),
mDeformableGroup(deformableGroup),
mGroup(group) {}
DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string name, const void *data,
size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &mStride);
deserialize_value(&data, &length, &mPadding);
deserialize_value(&data, &length, &mDilation);
deserialize_value(&data, &length, &mDeformableGroup);
deserialize_value(&data, &length, &mGroup);
}
DeformableConvPluginDynamic::~DeformableConvPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt *DeformableConvPluginDynamic::clone() const TRT_NOEXCEPT {
DeformableConvPluginDynamic *plugin = new DeformableConvPluginDynamic(
mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs DeformableConvPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
// input[0] == input
// input[1] == offset
// input[2] == weight
nvinfer1::DimsExprs ret;
ret.nbDims = 4;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[2].d[0];
ret.d[2] = inputs[1].d[2];
ret.d[3] = inputs[1].d[3];
return ret;
}
bool DeformableConvPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 0) {
return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT ||
ioDesc[pos].type == nvinfer1::DataType::kHALF) &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
}
}
void DeformableConvPluginDynamic::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs,
int nbOutputs) TRT_NOEXCEPT {}
size_t DeformableConvPluginDynamic::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT {
int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type);
int batch_size = inputs[0].dims.d[0];
int nInputPlane = inputs[0].dims.d[1];
int inputHeight = inputs[0].dims.d[2];
int inputWidth = inputs[0].dims.d[3];
int nOutputPlane = outputs[0].dims.d[1];
int outputHeight = outputs[0].dims.d[2];
int outputWidth = outputs[0].dims.d[3];
int kW = inputs[2].dims.d[2];
int kH = inputs[2].dims.d[3];
int im2col_step = std::min(32, batch_size);
size_t col_size = mmdeploy::getAlignedSize(nInputPlane * kW * kH * im2col_step * outputHeight *
outputWidth * sizeof_dtype);
size_t out_size = 0;
if (im2col_step != 1)
out_size = mmdeploy::getAlignedSize(batch_size * nOutputPlane * outputHeight * outputWidth *
sizeof_dtype);
return col_size + out_size;
}
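// Workspace layout sketch (derived from the sizes above): the single workspace
// returned here is consumed by enqueue/deform_conv as two consecutive regions,
//
//   [ column buffer: nInputPlane * kW * kH * im2col_step * outH * outW elements ]
//   [ optional output staging buffer, only present when im2col_step != 1        ]
//
// each rounded up by mmdeploy::getAlignedSize.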
int DeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs,
void *workSpace, cudaStream_t stream) TRT_NOEXCEPT {
int batch = inputDesc[0].dims.d[0];
int channels = inputDesc[0].dims.d[1];
int height = inputDesc[0].dims.d[2];
int width = inputDesc[0].dims.d[3];
int channels_out = outputDesc[0].dims.d[1];
int kernel_h = inputDesc[2].dims.d[2];
int kernel_w = inputDesc[2].dims.d[3];
const void *x = inputs[0];
const void *offset = inputs[1];
const void *weight = inputs[2];
void *output = outputs[0];
int im2col_step = std::min(batch, 32);
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
deform_conv<float>((float *)x, (float *)weight, (float *)offset, (float *)output, workSpace,
batch, channels, height, width, channels_out, kernel_w, kernel_h,
mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0],
mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle,
stream);
break;
case nvinfer1::DataType::kHALF:
deform_conv<half>((half *)x, (half *)weight, (half *)offset, (half *)output, workSpace, batch,
channels, height, width, channels_out, kernel_w, kernel_h, mStride.d[0],
mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1],
mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream);
break;
default:
return 1;
}
return 0;
}
nvinfer1::DataType DeformableConvPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *DeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *DeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
int DeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t DeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT {
return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) +
serialized_size(mDeformableGroup) + serialized_size(mGroup);
}
void DeformableConvPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mStride);
serialize_value(&buffer, mPadding);
serialize_value(&buffer, mDilation);
serialize_value(&buffer, mDeformableGroup);
serialize_value(&buffer, mGroup);
}
void DeformableConvPluginDynamic::attachToContext(
cudnnContext *cudnnContext, cublasContext *cublasContext,
nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT {
m_cublas_handle = cublasContext;
}
void DeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {}
////////////////////// creator /////////////////////////////
DeformableConvPluginDynamicCreator::DeformableConvPluginDynamicCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("stride"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("padding"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("groups"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *DeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT {
return PLUGIN_NAME;
}
const char *DeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
nvinfer1::Dims stride{2, {1, 1}};
nvinfer1::Dims padding{2, {0, 0}};
nvinfer1::Dims dilation{2, {1, 1}};
int deformableGroup = 1;
int group = 1;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("deform_groups") == 0) {
deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("groups") == 0) {
group = static_cast<const int *>(fc->fields[i].data)[0];
}
if (field_name.compare("stride") == 0) {
stride.nbDims = 2;
stride.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
stride.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
if (field_name.compare("padding") == 0) {
padding.nbDims = 2;
padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
if (field_name.compare("dilation") == 0) {
dilation.nbDims = 2;
dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
}
}
DeformableConvPluginDynamic *plugin =
new DeformableConvPluginDynamic(name, stride, padding, dilation, deformableGroup, group);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
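// Illustrative sketch (hypothetical values): a PluginFieldCollection passed to this
// creator might carry
//
//   stride        = {1, 1}
//   padding       = {1, 1}
//   dilation      = {1, 1}
//   groups        = 1
//   deform_groups = 1
//
// Fields that are absent or have null data simply keep the defaults above.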
nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
auto plugin = new DeformableConvPluginDynamic(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(DeformableConvPluginDynamicCreator);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_DEFORM_CONV_HPP
#define TRT_DEFORM_CONV_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class DeformableConvPluginDynamic : public TRTPluginBase {
public:
DeformableConvPluginDynamic(const std::string &name, const nvinfer1::Dims stride,
const nvinfer1::Dims padding, const nvinfer1::Dims dilation,
const int deformableGroup, const int group);
DeformableConvPluginDynamic(const std::string name, const void *data, size_t length);
DeformableConvPluginDynamic() = delete;
~DeformableConvPluginDynamic() TRT_NOEXCEPT override;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext,
nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
private:
nvinfer1::Dims mStride;
nvinfer1::Dims mPadding;
nvinfer1::Dims mDilation;
int mDeformableGroup;
int mGroup;
cublasHandle_t m_cublas_handle;
};
class DeformableConvPluginDynamicCreator : public TRTPluginCreatorBase {
public:
DeformableConvPluginDynamicCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_DEFORM_CONV_HPP
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer
*****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
*FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer
*********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
#include "common_cuda_helper.hpp"
#include "trt_deform_conv_kernel.cuh"
#include "trt_deform_conv_kernel.hpp"
#include "trt_plugin_helper.hpp"
template <typename scalar_t>
void deform_conv_im2col(const scalar_t* input, const scalar_t* offset, scalar_t* column,
const int channels, const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, cudaStream_t stream) {
int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
deformable_im2col_gpu_kernel<scalar_t><<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, input, offset, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col, column);
cudaCheckError();
}
template <typename scalar_t>
void deform_conv(const scalar_t* input, const scalar_t* weight, const scalar_t* offset,
scalar_t* output, void* workspace, int batchSize, int nInputPlane, int inputHeight,
int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group, int deformable_group,
int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream) {
size_t word_size = sizeof(scalar_t);
im2col_step = std::min(int(batchSize), im2col_step);
long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
long outputHW = outputHeight * outputWidth;
long kHW = kH * kW;
long columns_size =
mmdeploy::getAlignedSize(nInputPlane * kHW * im2col_step * outputHW * word_size);
  // column buffer for im2col
char* workspace_ptr = reinterpret_cast<char*>(workspace);
scalar_t* columns = reinterpret_cast<scalar_t*>(workspace_ptr);
workspace_ptr = workspace_ptr + columns_size;
scalar_t* output_buffer;
if (im2col_step == 1) {
output_buffer = output;
} else {
    // output needs a permute when im2col_step != 1
output_buffer = reinterpret_cast<scalar_t*>(workspace_ptr);
}
long input_elt_step = im2col_step * nInputPlane * inputHeight * inputWidth;
long offset_elt_step = im2col_step * deformable_group * 2 * kHW * outputHW;
long out_buffer_step = nOutputPlane * im2col_step * outputHW;
long col_g_step = nInputPlane * kHW * im2col_step * outputHW / group;
long weight_g_step = nOutputPlane * nInputPlane * kHW / (group * group);
long out_buffer_g_step = out_buffer_step / group;
int m = nOutputPlane / group;
int n = im2col_step * outputHW;
int k = nInputPlane * kHW / group;
scalar_t alpha = 1.f;
scalar_t beta = 0.f;
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
const scalar_t* input_start = input + elt * input_elt_step;
const scalar_t* offset_start = offset + elt * offset_elt_step;
deform_conv_im2col<scalar_t>(input_start, offset_start, columns, nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW,
im2col_step, deformable_group, stream);
for (int g = 0; g < group; ++g) {
const scalar_t* weight_start = weight + g * weight_g_step;
scalar_t* col_start = columns + g * col_g_step;
scalar_t* out_buffer_start = output_buffer + elt * out_buffer_step + g * out_buffer_g_step;
cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start,
n, weight_start, k, &beta, out_buffer_start, n);
cudaCheckError();
}
}
if (im2col_step != 1) {
int output_buffer_shape[5] = {batchSize / im2col_step, nOutputPlane, im2col_step,
static_cast<int>(outputHeight), static_cast<int>(outputWidth)};
int output_buffer_permute[5] = {0, 2, 1, 3, 4};
memcpyPermute<scalar_t>(output, output_buffer, &output_buffer_shape[0],
&output_buffer_permute[0], 5, stream);
}
}
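// Shape note (derived from the code above): when im2col_step != 1, GEMM writes the
// staging buffer in [batch / step, nOutputPlane, step, outH, outW] order, so the
// permute {0, 2, 1, 3, 4} reorders it to [batch / step, step, nOutputPlane, outH,
// outW], i.e. contiguous [batch, nOutputPlane, outH, outW] as expected for the
// plugin output.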
template void deform_conv<float>(const float* input, const float* weight, const float* offset,
float* output, void* workspace, int batchSize, int nInputPlane,
int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH,
int dW, int dH, int padW, int padH, int dilationW, int dilationH,
int group, int deformable_group, int im2col_step,
cublasHandle_t cublas_handle, cudaStream_t stream);
template void deform_conv<__half>(const __half* input, const __half* weight, const __half* offset,
__half* output, void* workspace, int batchSize, int nInputPlane,
int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH,
int dW, int dH, int padW, int padH, int dilationW, int dilationH,
int group, int deformable_group, int im2col_step,
cublasHandle_t cublas_handle, cudaStream_t stream);
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer
*****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
*FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer
*********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
#include <cuda_fp16.h>
#include "common_cuda_helper.hpp"
template <typename scalar_t>
__device__ __forceinline__ scalar_t deformable_im2col_bilinear(const scalar_t* __restrict__ input,
const int height, const int width,
float h, float w) {
if (h <= -1 || height <= h || w <= -1 || width <= w) {
return 0;
}
const int h_low = floorf(h);
const int w_low = floorf(w);
input += h_low * width;
const scalar_t v1 = (h_low >= 0 && w_low >= 0) ? input[w_low] : static_cast<scalar_t>(0.0f);
const int w_high = w_low + 1;
const scalar_t v2 =
(h_low >= 0 && w_high <= width - 1) ? input[w_high] : static_cast<scalar_t>(0.0f);
const scalar_t lw = w - w_low;
const scalar_t v_low = fmaf(v2 - v1, lw, v1);
input += width;
const scalar_t v3 =
(h_low <= height - 2 && w_low >= 0) ? input[w_low] : static_cast<scalar_t>(0.0f);
const scalar_t v4 =
(h_low <= height - 2 && w_high <= width - 1) ? input[w_high] : static_cast<scalar_t>(0.0f);
const scalar_t v_high = fmaf(v4 - v3, lw, v3);
const scalar_t lh = h - h_low;
const scalar_t val = fmaf(v_high - v_low, lh, v_low);
return val;
}
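// Worked example (illustrative values): with corner samples v1 = 0, v2 = 1,
// v3 = 2, v4 = 3 and a sampling point at (h, w) = (0.5, 0.5) inside that cell
// (lh = lw = 0.5), the two row interpolations give v_low = 0.5 and v_high = 2.5,
// so the returned value is fmaf(2.0f, 0.5f, 0.5f) = 1.5.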
template <>
__device__ __forceinline__ __half deformable_im2col_bilinear(const __half* __restrict__ input,
const int height, const int width,
float h, float w) {
if (h <= -1 || height <= h || w <= -1 || width <= w) {
return 0;
}
const int h_low = floorf(h);
const int w_low = floorf(w);
input += h_low * width;
const float v1 = (h_low >= 0 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f;
const int w_high = w_low + 1;
const float v2 = (h_low >= 0 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f;
const float lw = w - w_low;
const float v_low = fmaf(v2 - v1, lw, v1);
input += width;
const float v3 = (h_low <= height - 2 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f;
const float v4 =
(h_low <= height - 2 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f;
const float v_high = fmaf(v4 - v3, lw, v3);
const float lh = h - h_low;
const float val = fmaf(v_high - v_low, lh, v_low);
return __float2half(val);
}
template <typename scalar_t>
__global__ void deformable_im2col_gpu_kernel(
const int n, const scalar_t* __restrict__ data_im, const scalar_t* __restrict__ data_offset,
const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col, const int width_col,
scalar_t* __restrict__ data_col) {
const int hw_col = height_col * width_col;
const int data_col_step = batch_size * hw_col;
CUDA_1D_KERNEL_LOOP(index, n) {
    // index: linear index into the output (column) matrix
int tmp_index = index;
const int w_col = tmp_index % width_col;
tmp_index /= width_col;
const int h_col = tmp_index % height_col;
tmp_index /= height_col;
const int b_col = tmp_index % batch_size;
const int c_im = tmp_index / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
scalar_t* __restrict__ data_col_ptr = data_col + c_col * data_col_step + index % data_col_step;
const scalar_t* __restrict__ data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const scalar_t* __restrict__ data_offset_ptr =
data_offset +
((b_col * deformable_group + deformable_group_index) << 1) * kernel_h * kernel_w * hw_col +
h_col * width_col + w_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h = (i * kernel_w + j) * hw_col << 1;
const scalar_t offset_h = data_offset_ptr[data_offset_h];
const int data_offset_w = data_offset_h + hw_col;
const scalar_t offset_w = data_offset_ptr[data_offset_w];
const scalar_t h_im = h_in + i * dilation_h + (float)offset_h;
const scalar_t w_im = w_in + j * dilation_w + (float)offset_w;
const scalar_t val = deformable_im2col_bilinear(data_im_ptr, height, width, h_im, w_im);
*data_col_ptr = val;
data_col_ptr += data_col_step;
}
}
}
}
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TRT_DEFORM_CONV_KERNEL_HPP
#define TRT_DEFORM_CONV_KERNEL_HPP
#include <cublas_v2.h>
#include <cuda_runtime.h>
template <typename scalar_t>
void deform_conv_im2col(const scalar_t* input, const scalar_t* offset, scalar_t* column,
const int channels, const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, cudaStream_t stream);
template <typename scalar_t>
void deform_conv(const scalar_t* input, const scalar_t* weight, const scalar_t* offset,
scalar_t* output, void* workspace, int batchSize, int nInputPlane, int inputHeight,
int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW,
int padH, int dilationW, int dilationH, int group, int deformable_group,
int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream);
#endif // TRT_DEFORM_CONV_KERNEL_HPP
// Copyright (c) OpenMMLab. All rights reserved.
#include "gather_topk.hpp"
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include "NvInferVersion.h"
#include "gather_topk_kernel.hpp"
#include "trt_serialize.hpp"
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"GatherTopk"};
} // namespace
GatherTopk::GatherTopk(const std::string &name) : TRTPluginBase(name) {}
GatherTopk::GatherTopk(const std::string name, const void *data, size_t length)
: TRTPluginBase(name) {}
nvinfer1::IPluginV2DynamicExt *GatherTopk::clone() const TRT_NOEXCEPT {
GatherTopk *plugin = new GatherTopk(mLayerName);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs GatherTopk::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
assert(inputs[0].nbDims >= inputs[1].nbDims);
nvinfer1::DimsExprs ret;
ret.nbDims = inputs[0].nbDims;
for (int i = 0; i < inputs[1].nbDims; ++i) {
ret.d[i] = inputs[1].d[i];
}
for (int i = inputs[1].nbDims; i < inputs[0].nbDims; ++i) {
ret.d[i] = inputs[0].d[i];
}
return ret;
}
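// Shape example (illustrative): gathering top-k boxes with data of shape
// [batch, num_boxes, 4] and indices of shape [batch, k] produces an output of
// shape [batch, k, 4]; the leading dims come from the indices, the trailing
// dims from the data.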
bool GatherTopk::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
switch (pos) {
case 0:
// data
return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) ||
(ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
case 1:
// indices
return ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
case 2:
// output
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
default:
return true;
}
return true;
}
void GatherTopk::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs,
int nbOutputs) TRT_NOEXCEPT {}
size_t GatherTopk::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT {
return 0;
}
int GatherTopk::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) TRT_NOEXCEPT {
const int *dims = &(inputDesc[0].dims.d[0]);
const int *indices_dims = &(inputDesc[1].dims.d[0]);
int nbDims = inputDesc[0].dims.nbDims;
int indice_nbDims = inputDesc[1].dims.nbDims;
const void *data = inputs[0];
const void *indices = inputs[1];
void *output = outputs[0];
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
gather_topk_impl<float>((float *)data, (int *)indices, dims, nbDims, indices_dims,
indice_nbDims, (float *)output, stream);
break;
case nvinfer1::DataType::kINT32:
gather_topk_impl<int>((int *)data, (int *)indices, dims, nbDims, indices_dims, indice_nbDims,
(int *)output, stream);
break;
default:
break;
}
return 0;
}
nvinfer1::DataType GatherTopk::getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *GatherTopk::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *GatherTopk::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
int GatherTopk::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t GatherTopk::getSerializationSize() const TRT_NOEXCEPT { return 0; }
void GatherTopk::serialize(void *buffer) const TRT_NOEXCEPT {}
GatherTopkCreator::GatherTopkCreator() {
mPluginAttributes.clear();
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *GatherTopkCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *GatherTopkCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
nvinfer1::IPluginV2 *GatherTopkCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
auto *plugin = new GatherTopk(name);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *GatherTopkCreator::deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT {
auto plugin = new GatherTopk(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(GatherTopkCreator);
} // namespace mmdeploy