OpenDAS / mmdeploy · Commit 546b4279
authored Jun 25, 2025 by limm

add csrc and mmdeploy module

parent 502f4fb9 · Pipeline #2810 canceled with stages · Changes: 447 · Pipelines: 1

Showing 20 changed files with 2841 additions and 0 deletions (+2841, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h (+15, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h (+86, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp (+77, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp (+155, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp (+97, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu (+267, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu (+495, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp (+125, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu (+164, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu (+107, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu (+76, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu (+141, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu (+81, -0)
csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu (+95, -0)
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp (+263, -0)
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp (+81, -0)
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu (+172, -0)
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh (+174, -0)
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp (+20, -0)
csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp (+150, -0)
csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include "kernel.h"

template <typename KeyT, typename ValueT>
size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) {
  size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedRadixSort::SortPairsDescending(
      (void *)NULL, temp_storage_bytes,
      (const KeyT *)NULL, (KeyT *)NULL,
      (const ValueT *)NULL, (ValueT *)NULL,
      num_items,     // # items
      num_segments,  // # segments
      (const int *)NULL, (const int *)NULL);
  return temp_storage_bytes;
}
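Note (illustrative, not part of the commit): cubSortPairsWorkspaceSize follows CUB's two-phase convention, where a call with a null temp-storage pointer only reports the bytes required. A minimal host-side sketch of the second phase, assuming float keys, int values, caller-allocated device arrays d_keys_in/d_keys_out/d_vals_in/d_vals_out, and num_segments + 1 segment offsets in d_offsets (all names hypothetical):

// Sketch: query the workspace size, allocate it, then run the real segmented sort.
size_t ws_bytes = cubSortPairsWorkspaceSize<float, int>(num_items, num_segments);
void *d_ws = nullptr;
cudaMalloc(&d_ws, ws_bytes);  // temp storage for CUB
cub::DeviceSegmentedRadixSort::SortPairsDescending(
    d_ws, ws_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, num_items, num_segments,
    d_offsets, d_offsets + 1, 0, sizeof(float) * 8, stream);
cudaFree(d_ws);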
csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#ifndef TRT_KERNEL_H
#define TRT_KERNEL_H

#include <cuda_runtime.h>

#include <cassert>
#include <cstdio>

#include "cublas_v2.h"
#include "trt_plugin_helper.hpp"

using namespace nvinfer1;

#define DEBUG_ENABLE 0

template <typename T>
struct Bbox {
  T xmin, ymin, xmax, ymax;
  Bbox(T xmin, T ymin, T xmax, T ymax) : xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax) {}
  Bbox() = default;
};

size_t get_cuda_arch(int devID);

int8_t *alignPtr(int8_t *ptr, uintptr_t to);

int8_t *nextWorkspacePtr(int8_t *ptr, uintptr_t previousWorkspaceSize);

void setUniformOffsets(cudaStream_t stream, int num_segments, int offset, int *d_offsets);

pluginStatus_t allClassNMS(cudaStream_t stream, int num, int num_classes, int num_preds_per_class,
                           int top_k, float nms_threshold, bool share_location, bool isNormalized,
                           DataType DT_SCORE, DataType DT_BBOX, void *bbox_data,
                           void *beforeNMS_scores, void *beforeNMS_index_array,
                           void *afterNMS_scores, void *afterNMS_index_array, bool flipXY = false);

pluginStatus_t allClassRotatedNMS(cudaStream_t stream, int num, int num_classes,
                                  int num_preds_per_class, int top_k, float nms_threshold,
                                  bool share_location, bool isNormalized, DataType DT_SCORE,
                                  DataType DT_BBOX, void *bbox_data, void *beforeNMS_scores,
                                  void *beforeNMS_index_array, void *afterNMS_scores,
                                  void *afterNMS_index_array, bool flipXY = false);

size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX);

size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX);

size_t sortScoresPerClassWorkspaceSize(int num, int num_classes, int num_preds_per_class,
                                       DataType DT_CONF);

size_t sortScoresPerImageWorkspaceSize(int num_images, int num_items_per_image, DataType DT_SCORE);

pluginStatus_t sortScoresPerImage(cudaStream_t stream, int num_images, int num_items_per_image,
                                  DataType DT_SCORE, void *unsorted_scores,
                                  void *unsorted_bbox_indices, void *sorted_scores,
                                  void *sorted_bbox_indices, void *workspace);

pluginStatus_t sortScoresPerClass(cudaStream_t stream, int num, int num_classes,
                                  int num_preds_per_class, int background_label_id,
                                  float confidence_threshold, DataType DT_SCORE,
                                  void *conf_scores_gpu, void *index_array_gpu, void *workspace);

size_t calculateTotalWorkspaceSize(size_t *workspaces, int count);

pluginStatus_t permuteData(cudaStream_t stream, int nthreads, int num_classes, int num_data,
                           int num_dim, DataType DT_DATA, bool confSigmoid, const void *data,
                           void *new_data);

size_t detectionForwardPreNMSSize(int N, int C2);

size_t detectionForwardPostNMSSize(int N, int numClasses, int topK);

pluginStatus_t gatherNMSOutputs(cudaStream_t stream, bool shareLocation, int numImages,
                                int numPredsPerClass, int numClasses, int topK, int keepTopK,
                                DataType DT_BBOX, DataType DT_SCORE, const void *indices,
                                const void *scores, const void *bboxData, void *nmsedDets,
                                void *nmsedLabels, void *nmsedIndex = nullptr,
                                bool clipBoxes = true, bool rotated = false);

size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses,
                                       int numPredsPerClass, int topK, DataType DT_BBOX,
                                       DataType DT_SCORE);

#endif
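Note (illustrative, not part of the commit): the sizing helpers declared above are meant to be summed once, up front, so that enqueue-time code can carve a single linear workspace instead of allocating per call. A hedged sketch of how a batched-NMS plugin might report its scratch needs (the function and parameter names are placeholders):

// Sketch: one call returns the total aligned scratch space for the whole NMS pipeline.
size_t workspaceBytesSketch(int N, int boxesSize, int scoresSize, int numClasses,
                            int numPredsPerClass, int topK) {
  return detectionInferenceWorkspaceSize(/*shareLocation=*/true, N, boxesSize, scoresSize,
                                         numClasses, numPredsPerClass, topK,
                                         nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kFLOAT);
}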
csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp (new file, mode 100644):

// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_PLUGIN_BASE_HPP
#define TRT_PLUGIN_BASE_HPP
#include "NvInferRuntime.h"
#include "NvInferVersion.h"
#include "trt_plugin_helper.hpp"

namespace mmdeploy {

#if NV_TENSORRT_MAJOR > 7
#define TRT_NOEXCEPT noexcept
#else
#define TRT_NOEXCEPT
#endif

class TRTPluginBase : public nvinfer1::IPluginV2DynamicExt {
 public:
  TRTPluginBase(const std::string &name) : mLayerName(name) {}
  // IPluginV2 Methods
  const char *getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
  int initialize() TRT_NOEXCEPT override { return STATUS_SUCCESS; }
  void terminate() TRT_NOEXCEPT override {}
  void destroy() TRT_NOEXCEPT override { delete this; }
  void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override {
    mNamespace = pluginNamespace;
  }
  const char *getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

  virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
                               const nvinfer1::DynamicPluginTensorDesc *out,
                               int nbOutputs) TRT_NOEXCEPT override {}

  virtual size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                                  const nvinfer1::PluginTensorDesc *outputs,
                                  int nbOutputs) const TRT_NOEXCEPT override {
    return 0;
  }

  virtual void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext,
                               nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override {}

  virtual void detachFromContext() TRT_NOEXCEPT override {}

 protected:
  const std::string mLayerName;
  std::string mNamespace;

#if NV_TENSORRT_MAJOR < 8
 protected:
  // To prevent compiler warnings.
  using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch;
  using nvinfer1::IPluginV2DynamicExt::enqueue;
  using nvinfer1::IPluginV2DynamicExt::getOutputDimensions;
  using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch;
  using nvinfer1::IPluginV2DynamicExt::supportsFormat;
#endif
};

class TRTPluginCreatorBase : public nvinfer1::IPluginCreator {
 public:
  const char *getPluginVersion() const TRT_NOEXCEPT override { return "1"; };

  const nvinfer1::PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return &mFC; }

  void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override {
    mNamespace = pluginNamespace;
  }

  const char *getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

 protected:
  nvinfer1::PluginFieldCollection mFC;
  std::vector<nvinfer1::PluginField> mPluginAttributes;
  std::string mNamespace;
};
}  // namespace mmdeploy
#endif
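Note (illustrative, not part of the commit): TRTPluginBase centralizes the IPluginV2DynamicExt boilerplate (namespace handling, trivial lifecycle methods), so a concrete op only overrides what actually differs. A hypothetical skeleton; "MyOp" is made up, and the remaining pure-virtual methods would still need real bodies:

// Sketch: a custom op inheriting the plumbing from TRTPluginBase.
class MyOpPlugin : public mmdeploy::TRTPluginBase {
 public:
  explicit MyOpPlugin(const std::string &name) : TRTPluginBase(name) {}
  const char *getPluginType() const TRT_NOEXCEPT override { return "MyOp"; }
  int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
  // clone, getOutputDimensions, supportsFormatCombination, enqueue,
  // getOutputDataType, getSerializationSize and serialize omitted here.
};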
csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp (new file, mode 100644):

// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_PLUGIN_HELPER_HPP
#define TRT_PLUGIN_HELPER_HPP
#include <cudnn.h>

#include <iostream>
#include <stdexcept>

#include "NvInferRuntime.h"

cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t *cudnn_dtype);

// Enumerator for status
typedef enum {
  STATUS_SUCCESS = 0,
  STATUS_FAILURE = 1,
  STATUS_BAD_PARAM = 2,
  STATUS_NOT_SUPPORTED = 3,
  STATUS_NOT_INITIALIZED = 4
} pluginStatus_t;
#define ASSERT(assertion) \
{ \
if (!(assertion)) { \
std::cerr << "#assertion" << __FILE__ << "," << __LINE__ << std::endl; \
abort(); \
} \
}
#define CUASSERT(status_) \
{ \
auto s_ = status_; \
if (s_ != cudaSuccess) { \
std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << ", " << cudaGetErrorString(s_) \
<< std::endl; \
} \
}
#define CUBLASASSERT(status_) \
{ \
auto s_ = status_; \
if (s_ != CUBLAS_STATUS_SUCCESS) { \
std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \
} \
}
#define CUERRORMSG(status_) \
{ \
auto s_ = status_; \
if (s_ != 0) std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \
}
#ifndef DEBUG
#define CHECK(status) \
do { \
if (status != 0) abort(); \
} while (0)
#define ASSERT_PARAM(exp) \
do { \
if (!(exp)) return STATUS_BAD_PARAM; \
} while (0)
#define ASSERT_FAILURE(exp) \
do { \
if (!(exp)) return STATUS_FAILURE; \
} while (0)
#define CSC(call, err) \
do { \
cudaError_t cudaStatus = call; \
if (cudaStatus != cudaSuccess) { \
return err; \
} \
} while (0)
#define DEBUG_PRINTF(...) \
do { \
} while (0)
#else
#define ASSERT_PARAM(exp) \
do { \
if (!(exp)) { \
fprintf(stderr, "Bad param - " #exp ", %s:%d\n", __FILE__, __LINE__); \
return STATUS_BAD_PARAM; \
} \
} while (0)
#define ASSERT_FAILURE(exp) \
do { \
if (!(exp)) { \
fprintf(stderr, "Failure - " #exp ", %s:%d\n", __FILE__, __LINE__); \
return STATUS_FAILURE; \
} \
} while (0)
#define CSC(call, err) \
do { \
cudaError_t cudaStatus = call; \
if (cudaStatus != cudaSuccess) { \
printf("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaStatus)); \
return err; \
} \
} while (0)
#define CHECK(status) \
{ \
if (status != 0) { \
DEBUG_PRINTF("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \
abort(); \
} \
}
#define DEBUG_PRINTF(...) \
do { \
printf(__VA_ARGS__); \
} while (0)
#endif
namespace mmdeploy {

const int MAXTENSORDIMS = 10;

struct TensorDesc {
  int shape[MAXTENSORDIMS];
  int stride[MAXTENSORDIMS];
  int dim;
};

inline unsigned int getElementSize(nvinfer1::DataType t) {
  switch (t) {
    case nvinfer1::DataType::kINT32:
      return 4;
    case nvinfer1::DataType::kFLOAT:
      return 4;
    case nvinfer1::DataType::kHALF:
      return 2;
    // case nvinfer1::DataType::kBOOL:
    case nvinfer1::DataType::kINT8:
      return 1;
    default:
      throw std::runtime_error("Invalid DataType.");
  }
  throw std::runtime_error("Invalid DataType.");
  return 0;
}

inline size_t getAlignedSize(size_t origin_size, size_t aligned_number = 16) {
  return size_t((origin_size + aligned_number - 1) / aligned_number) * aligned_number;
}

}  // namespace mmdeploy
#endif  // TRT_PLUGIN_HELPER_HPP
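Note (illustrative, not part of the commit): getAlignedSize rounds a byte count up to the next multiple of aligned_number, for example:

// getAlignedSize(100)      == 112  (next multiple of the default 16)
// getAlignedSize(128)      == 128  (already aligned)
// getAlignedSize(100, 256) == 256  (explicit alignment)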
csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/common/serialize.hpp
#ifndef TRT_SERIALIZE_HPP
#define TRT_SERIALIZE_HPP
#include <cassert>
#include <cstring>
#include <type_traits>
#include <vector>

template <typename T>
inline void serialize_value(void **buffer, T const &value);

template <typename T>
inline void deserialize_value(void const **buffer, size_t *buffer_size, T *value);

namespace {

template <typename T, class Enable = void>
struct Serializer {};

template <typename T>
struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
                                             std::is_enum<T>::value ||
                                             std::is_pod<T>::value>::type> {
  static size_t serialized_size(T const &value) { return sizeof(T); }
  static void serialize(void **buffer, T const &value) {
    ::memcpy(*buffer, &value, sizeof(T));
    reinterpret_cast<char *&>(*buffer) += sizeof(T);
  }
  static void deserialize(void const **buffer, size_t *buffer_size, T *value) {
    assert(*buffer_size >= sizeof(T));
    ::memcpy(value, *buffer, sizeof(T));
    reinterpret_cast<char const *&>(*buffer) += sizeof(T);
    *buffer_size -= sizeof(T);
  }
};

template <>
struct Serializer<const char *> {
  static size_t serialized_size(const char *value) { return strlen(value) + 1; }
  static void serialize(void **buffer, const char *value) {
    ::strcpy(static_cast<char *>(*buffer), value);
    reinterpret_cast<char *&>(*buffer) += strlen(value) + 1;
  }
  static void deserialize(void const **buffer, size_t *buffer_size, const char **value) {
    *value = static_cast<char const *>(*buffer);
    size_t data_size = strnlen(*value, *buffer_size) + 1;
    assert(*buffer_size >= data_size);
    reinterpret_cast<char const *&>(*buffer) += data_size;
    *buffer_size -= data_size;
  }
};

template <typename T>
struct Serializer<std::vector<T>,
                  typename std::enable_if<std::is_arithmetic<T>::value ||
                                          std::is_enum<T>::value ||
                                          std::is_pod<T>::value>::type> {
  static size_t serialized_size(std::vector<T> const &value) {
    return sizeof(value.size()) + value.size() * sizeof(T);
  }
  static void serialize(void **buffer, std::vector<T> const &value) {
    serialize_value(buffer, value.size());
    size_t nbyte = value.size() * sizeof(T);
    ::memcpy(*buffer, value.data(), nbyte);
    reinterpret_cast<char *&>(*buffer) += nbyte;
  }
  static void deserialize(void const **buffer, size_t *buffer_size, std::vector<T> *value) {
    size_t size;
    deserialize_value(buffer, buffer_size, &size);
    value->resize(size);
    size_t nbyte = value->size() * sizeof(T);
    assert(*buffer_size >= nbyte);
    ::memcpy(value->data(), *buffer, nbyte);
    reinterpret_cast<char const *&>(*buffer) += nbyte;
    *buffer_size -= nbyte;
  }
};

}  // namespace

template <typename T>
inline size_t serialized_size(T const &value) {
  return Serializer<T>::serialized_size(value);
}

template <typename T>
inline void serialize_value(void **buffer, T const &value) {
  return Serializer<T>::serialize(buffer, value);
}

template <typename T>
inline void deserialize_value(void const **buffer, size_t *buffer_size, T *value) {
  return Serializer<T>::deserialize(buffer, buffer_size, value);
}
#endif  // TRT_SERIALIZE_HPP
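Note (illustrative, not part of the commit): these helpers are the usual way TensorRT plugins pack attributes into the engine blob and read them back in the deserialization constructor. A small host-only round-trip sketch with made-up values:

// Sketch: serialize an int and a std::vector<float>, then read them back.
int top_k = 1000;
std::vector<float> anchors = {0.5f, 1.0f, 2.0f};

size_t total = serialized_size(top_k) + serialized_size(anchors);
std::vector<char> blob(total);

void *w = blob.data();
serialize_value(&w, top_k);    // advances w by sizeof(int)
serialize_value(&w, anchors);  // writes size() followed by the raw elements

const void *r = blob.data();
size_t remaining = blob.size();
int top_k2;
std::vector<float> anchors2;
deserialize_value(&r, &remaining, &top_k2);
deserialize_value(&r, &remaining, &anchors2);  // remaining drops to 0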
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>

#include "nms/kernel.h"

const static int BS = 512;

template <typename T_BBOX>
__device__ T_BBOX bboxSize(const Bbox<T_BBOX> &bbox, const bool normalized, T_BBOX offset) {
  if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin) {
    // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
    return 0;
  } else {
    T_BBOX width = bbox.xmax - bbox.xmin;
    T_BBOX height = bbox.ymax - bbox.ymin;
    if (normalized) {
      return width * height;
    } else {
      // If bbox is not within range [0, 1].
      return (width + offset) * (height + offset);
    }
  }
}

template <typename T_BBOX>
__device__ void intersectBbox(const Bbox<T_BBOX> &bbox1, const Bbox<T_BBOX> &bbox2,
                              Bbox<T_BBOX> *intersect_bbox) {
  if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin || bbox2.ymin > bbox1.ymax ||
      bbox2.ymax < bbox1.ymin) {
    // Return [0, 0, 0, 0] if there is no intersection.
    intersect_bbox->xmin = T_BBOX(0);
    intersect_bbox->ymin = T_BBOX(0);
    intersect_bbox->xmax = T_BBOX(0);
    intersect_bbox->ymax = T_BBOX(0);
  } else {
    intersect_bbox->xmin = max(bbox1.xmin, bbox2.xmin);
    intersect_bbox->ymin = max(bbox1.ymin, bbox2.ymin);
    intersect_bbox->xmax = min(bbox1.xmax, bbox2.xmax);
    intersect_bbox->ymax = min(bbox1.ymax, bbox2.ymax);
  }
}

template <typename T_BBOX>
__device__ float jaccardOverlap(const Bbox<T_BBOX> &bbox1, const Bbox<T_BBOX> &bbox2,
                                const bool normalized, T_BBOX offset) {
  Bbox<T_BBOX> intersect_bbox;
  intersectBbox(bbox1, bbox2, &intersect_bbox);
  float intersect_width, intersect_height;
  if (normalized) {
    intersect_width = intersect_bbox.xmax - intersect_bbox.xmin;
    intersect_height = intersect_bbox.ymax - intersect_bbox.ymin;
  } else {
    intersect_width = intersect_bbox.xmax - intersect_bbox.xmin + offset;
    intersect_height = intersect_bbox.ymax - intersect_bbox.ymin + offset;
  }
  if (intersect_width > 0 && intersect_height > 0) {
    float intersect_size = intersect_width * intersect_height;
    float bbox1_size = bboxSize(bbox1, normalized, offset);
    float bbox2_size = bboxSize(bbox2, normalized, offset);
    return intersect_size / (bbox1_size + bbox2_size - intersect_size);
  } else {
    return 0.;
  }
}

/********** new NMS for only score and index array **********/
// clang-format off
template <typename T_SCORE, typename T_BBOX, int TSIZE>
__global__ void
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ == 620 || __CUDA_ARCH__ == 530
__launch_bounds__(512)
#endif
#endif
allClassNMS_kernel(const int num, const int num_classes, const int num_preds_per_class,
                   const int top_k, const float nms_threshold, const bool share_location,
                   const bool isNormalized,
                   T_BBOX *bbox_data,  // bbox_data should be float to preserve
                                       // location information
                   T_SCORE *beforeNMS_scores, int *beforeNMS_index_array,
                   T_SCORE *afterNMS_scores, int *afterNMS_index_array, bool flipXY = false) {
  // clang-format on
  //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE];
  __shared__ bool kept_bboxinfo_flag[TSIZE * BS];
  for (int i = 0; i < num; i++) {
    const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class;
    const int max_idx = offset + top_k;  // put top_k bboxes into NMS calculation
    const int bbox_idx_offset =
        share_location ? (i * num_preds_per_class) : (i * num_classes * num_preds_per_class);

    // local thread data
    int loc_bboxIndex[TSIZE];
    Bbox<T_BBOX> loc_bbox[TSIZE];

    // initialize Bbox, Bboxinfo, kept_bboxinfo_flag
    // Eliminate shared memory RAW hazard
    __syncthreads();
#pragma unroll
    for (int t = 0; t < TSIZE; t++) {
      const int cur_idx = threadIdx.x + blockDim.x * t;
      const int item_idx = offset + cur_idx;

      if (item_idx < max_idx) {
        loc_bboxIndex[t] = beforeNMS_index_array[item_idx];

        if (loc_bboxIndex[t] >= 0)
        // if (loc_bboxIndex[t] != -1)
        {
          const int bbox_data_idx =
              share_location ? (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset)
                             : loc_bboxIndex[t];

          loc_bbox[t].xmin =
              flipXY ? bbox_data[bbox_data_idx * 4 + 1] : bbox_data[bbox_data_idx * 4 + 0];
          loc_bbox[t].ymin =
              flipXY ? bbox_data[bbox_data_idx * 4 + 0] : bbox_data[bbox_data_idx * 4 + 1];
          loc_bbox[t].xmax =
              flipXY ? bbox_data[bbox_data_idx * 4 + 3] : bbox_data[bbox_data_idx * 4 + 2];
          loc_bbox[t].ymax =
              flipXY ? bbox_data[bbox_data_idx * 4 + 2] : bbox_data[bbox_data_idx * 4 + 3];
          kept_bboxinfo_flag[cur_idx] = true;
        } else {
          kept_bboxinfo_flag[cur_idx] = false;
        }
      } else {
        kept_bboxinfo_flag[cur_idx] = false;
      }
    }

    // filter out overlapped boxes with lower scores
    int ref_item_idx = offset;
    int ref_bbox_idx =
        share_location
            ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
            : beforeNMS_index_array[ref_item_idx];

    while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) {
      Bbox<T_BBOX> ref_bbox;
      ref_bbox.xmin = flipXY ? bbox_data[ref_bbox_idx * 4 + 1] : bbox_data[ref_bbox_idx * 4 + 0];
      ref_bbox.ymin = flipXY ? bbox_data[ref_bbox_idx * 4 + 0] : bbox_data[ref_bbox_idx * 4 + 1];
      ref_bbox.xmax = flipXY ? bbox_data[ref_bbox_idx * 4 + 3] : bbox_data[ref_bbox_idx * 4 + 2];
      ref_bbox.ymax = flipXY ? bbox_data[ref_bbox_idx * 4 + 2] : bbox_data[ref_bbox_idx * 4 + 3];

      // Eliminate shared memory RAW hazard
      __syncthreads();

      for (int t = 0; t < TSIZE; t++) {
        const int cur_idx = threadIdx.x + blockDim.x * t;
        const int item_idx = offset + cur_idx;

        if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) {
          // TODO: may need to add bool normalized as argument, HERE true means
          // normalized
          if (jaccardOverlap(ref_bbox, loc_bbox[t], isNormalized, T_BBOX(0)) > nms_threshold) {
            kept_bboxinfo_flag[cur_idx] = false;
          }
        }
      }
      __syncthreads();

      do {
        ref_item_idx++;
      } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]);

      ref_bbox_idx =
          share_location
              ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
              : beforeNMS_index_array[ref_item_idx];
    }

    // store data
    for (int t = 0; t < TSIZE; t++) {
      const int cur_idx = threadIdx.x + blockDim.x * t;
      const int read_item_idx = offset + cur_idx;
      const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx;
      /*
       * If not not keeping the bbox
       * Set the score to 0
       * Set the bounding box index to -1
       */
      if (read_item_idx < max_idx) {
        afterNMS_scores[write_item_idx] =
            kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f;
        afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1;
      }
    }
  }
}

template <typename T_SCORE, typename T_BBOX>
pluginStatus_t allClassNMS_gpu(cudaStream_t stream, const int num, const int num_classes,
                               const int num_preds_per_class, const int top_k,
                               const float nms_threshold, const bool share_location,
                               const bool isNormalized, void *bbox_data, void *beforeNMS_scores,
                               void *beforeNMS_index_array, void *afterNMS_scores,
                               void *afterNMS_index_array, bool flipXY = false) {
#define P(tsize) allClassNMS_kernel<T_SCORE, T_BBOX, (tsize)>

  void (*kernel[10])(const int, const int, const int, const int, const float, const bool,
                     const bool, float *, T_SCORE *, int *, T_SCORE *, int *, bool) = {
      P(1), P(2), P(3), P(4), P(5), P(6), P(7), P(8), P(9), P(10),
  };

  const int GS = num_classes;
  const int t_size = (top_k + BS - 1) / BS;

  ASSERT(t_size <= 10);
  kernel[t_size - 1]<<<GS, BS, 0, stream>>>(
      num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized,
      (T_BBOX *)bbox_data, (T_SCORE *)beforeNMS_scores, (int *)beforeNMS_index_array,
      (T_SCORE *)afterNMS_scores, (int *)afterNMS_index_array, flipXY);

  cudaError_t code = cudaGetLastError();
  CUASSERT(code);
  CSC(code, STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// allClassNMS LAUNCH CONFIG
typedef pluginStatus_t (*nmsFunc)(cudaStream_t, const int, const int, const int, const int,
                                  const float, const bool, const bool, void *, void *, void *,
                                  void *, void *, bool);

struct nmsLaunchConfigSSD {
  DataType t_score;
  DataType t_bbox;
  nmsFunc function;

  nmsLaunchConfigSSD(DataType t_score, DataType t_bbox) : t_score(t_score), t_bbox(t_bbox) {}
  nmsLaunchConfigSSD(DataType t_score, DataType t_bbox, nmsFunc function)
      : t_score(t_score), t_bbox(t_bbox), function(function) {}
  bool operator==(const nmsLaunchConfigSSD &other) {
    return t_score == other.t_score && t_bbox == other.t_bbox;
  }
};

static std::vector<nmsLaunchConfigSSD> nmsFuncVec;

bool nmsInit() {
  nmsFuncVec.push_back(
      nmsLaunchConfigSSD(DataType::kFLOAT, DataType::kFLOAT, allClassNMS_gpu<float, float>));
  return true;
}

static bool initialized = nmsInit();

pluginStatus_t allClassNMS(cudaStream_t stream, const int num, const int num_classes,
                           const int num_preds_per_class, const int top_k,
                           const float nms_threshold, const bool share_location,
                           const bool isNormalized, const DataType DT_SCORE,
                           const DataType DT_BBOX, void *bbox_data, void *beforeNMS_scores,
                           void *beforeNMS_index_array, void *afterNMS_scores,
                           void *afterNMS_index_array, bool flipXY) {
  nmsLaunchConfigSSD lc(DT_SCORE, DT_BBOX);
  for (unsigned i = 0; i < nmsFuncVec.size(); ++i) {
    if (lc == nmsFuncVec[i]) {
      DEBUG_PRINTF("all class nms kernel %d\n", i);
      return nmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k,
                                    nms_threshold, share_location, isNormalized, bbox_data,
                                    beforeNMS_scores, beforeNMS_index_array, afterNMS_scores,
                                    afterNMS_index_array, flipXY);
    }
  }
  return STATUS_BAD_PARAM;
}
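Note (illustrative, not part of the commit): as a quick sanity check of jaccardOverlap, take normalized boxes (0, 0, 1, 1) and (0.5, 0, 1.5, 1). The intersection is 0.5 × 1 = 0.5 and each box has area 1, so IoU = 0.5 / (1 + 1 − 0.5) = 1/3 ≈ 0.33; with nms_threshold = 0.5 the lower-scored box survives, while nms_threshold = 0.3 would suppress it.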
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu (new file, mode 100644):

// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
#include <cmath>
#include <vector>

#include "nms/kernel.h"

template <typename T>
struct RotatedBox {
  T x_ctr, y_ctr, w, h, a;
};

template <typename T>
struct Point {
  T x, y;
  __host__ __device__ __forceinline__ Point(const T &px = 0, const T &py = 0) : x(px), y(py) {}
  __host__ __device__ __forceinline__ Point operator+(const Point &p) const {
    return Point(x + p.x, y + p.y);
  }
  __host__ __device__ __forceinline__ Point &operator+=(const Point &p) {
    x += p.x;
    y += p.y;
    return *this;
  }
  __host__ __device__ __forceinline__ Point operator-(const Point &p) const {
    return Point(x - p.x, y - p.y);
  }
  __host__ __device__ __forceinline__ Point operator*(const T coeff) const {
    return Point(x * coeff, y * coeff);
  }
};

template <typename T>
__host__ __device__ __forceinline__ T dot_2d(const Point<T> &A, const Point<T> &B) {
  return A.x * B.x + A.y * B.y;
}

template <typename T>
__host__ __device__ __forceinline__ T cross_2d(const Point<T> &A, const Point<T> &B) {
  return A.x * B.y - B.x * A.y;
}

template <typename T>
__host__ __device__ __forceinline__ void get_rotated_vertices(const RotatedBox<T> &box,
                                                              Point<T> (&pts)[4]) {
  // M_PI / 180. == 0.01745329251
  // double theta = box.a * 0.01745329251;
  // MODIFIED
  double theta = box.a;
  T cosTheta2 = (T)cos(theta) * 0.5f;
  T sinTheta2 = (T)sin(theta) * 0.5f;

  // y: top --> down; x: left --> right
  pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
  pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
  pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
  pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
  pts[2].x = 2 * box.x_ctr - pts[0].x;
  pts[2].y = 2 * box.y_ctr - pts[0].y;
  pts[3].x = 2 * box.x_ctr - pts[1].x;
  pts[3].y = 2 * box.y_ctr - pts[1].y;
}

template <typename T>
__host__ __device__ __forceinline__ int get_intersection_points(const Point<T> (&pts1)[4],
                                                                const Point<T> (&pts2)[4],
                                                                Point<T> (&intersections)[24]) {
  // Line vector
  // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
  Point<T> vec1[4], vec2[4];
  for (int i = 0; i < 4; i++) {
    vec1[i] = pts1[(i + 1) % 4] - pts1[i];
    vec2[i] = pts2[(i + 1) % 4] - pts2[i];
  }

  // Line test - test all line combos for intersection
  int num = 0;  // number of intersections
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      // Solve for 2x2 Ax=b
      T det = cross_2d<T>(vec2[j], vec1[i]);

      // This takes care of parallel lines
      if (fabs(det) <= 1e-14) {
        continue;
      }

      auto vec12 = pts2[j] - pts1[i];

      T t1 = cross_2d<T>(vec2[j], vec12) / det;
      T t2 = cross_2d<T>(vec1[i], vec12) / det;

      if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
        intersections[num++] = pts1[i] + vec1[i] * t1;
      }
    }
  }

  // Check for vertices of rect1 inside rect2
  {
    const auto &AB = vec2[0];
    const auto &DA = vec2[3];
    auto ABdotAB = dot_2d<T>(AB, AB);
    auto ADdotAD = dot_2d<T>(DA, DA);
    for (int i = 0; i < 4; i++) {
      // assume ABCD is the rectangle, and P is the point to be judged
      // P is inside ABCD iff. P's projection on AB lies within AB
      // and P's projection on AD lies within AD
      auto AP = pts1[i] - pts2[0];
      auto APdotAB = dot_2d<T>(AP, AB);
      auto APdotAD = -dot_2d<T>(AP, DA);
      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
        intersections[num++] = pts1[i];
      }
    }
  }

  // Reverse the check - check for vertices of rect2 inside rect1
  {
    const auto &AB = vec1[0];
    const auto &DA = vec1[3];
    auto ABdotAB = dot_2d<T>(AB, AB);
    auto ADdotAD = dot_2d<T>(DA, DA);
    for (int i = 0; i < 4; i++) {
      auto AP = pts2[i] - pts1[0];
      auto APdotAB = dot_2d<T>(AP, AB);
      auto APdotAD = -dot_2d<T>(AP, DA);
      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
        intersections[num++] = pts2[i];
      }
    }
  }

  return num;
}

template <typename T>
__host__ __device__ __forceinline__ int convex_hull_graham(const Point<T> (&p)[24],
                                                           const int &num_in, Point<T> (&q)[24],
                                                           bool shift_to_zero = false) {
  assert(num_in >= 2);

  // Step 1:
  // Find point with minimum y
  // if more than 1 points have the same minimum y,
  // pick the one with the minimum x.
  int t = 0;
  for (int i = 1; i < num_in; i++) {
    if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
      t = i;
    }
  }
  auto &start = p[t];  // starting point

  // Step 2:
  // Subtract starting point from every points (for sorting in the next step)
  for (int i = 0; i < num_in; i++) {
    q[i] = p[i] - start;
  }

  // Swap the starting point to position 0
  auto tmp = q[0];
  q[0] = q[t];
  q[t] = tmp;

  // Step 3:
  // Sort point 1 ~ num_in according to their relative cross-product values
  // (essentially sorting according to angles)
  // If the angles are the same, sort according to their distance to origin
  T dist[24];
  for (int i = 0; i < num_in; i++) {
    dist[i] = dot_2d<T>(q[i], q[i]);
  }

  for (int i = 1; i < num_in - 1; i++) {
    for (int j = i + 1; j < num_in; j++) {
      T crossProduct = cross_2d<T>(q[i], q[j]);
      if ((crossProduct < -1e-6) || (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
        auto q_tmp = q[i];
        q[i] = q[j];
        q[j] = q_tmp;
        auto dist_tmp = dist[i];
        dist[i] = dist[j];
        dist[j] = dist_tmp;
      }
    }
  }

  // Step 4:
  // Make sure there are at least 2 points (that don't overlap with each other)
  // in the stack
  int k;  // index of the non-overlapped second point
  for (k = 1; k < num_in; k++) {
    if (dist[k] > 1e-8) {
      break;
    }
  }
  if (k == num_in) {
    // We reach the end, which means the convex hull is just one point
    q[0] = p[t];
    return 1;
  }
  q[1] = q[k];
  int m = 2;  // 2 points in the stack

  // Step 5:
  // Finally we can start the scanning process.
  // When a non-convex relationship between the 3 points is found
  // (either concave shape or duplicated points),
  // we pop the previous point from the stack
  // until the 3-point relationship is convex again, or
  // until the stack only contains two points
  for (int i = k + 1; i < num_in; i++) {
    while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
      m--;
    }
    q[m++] = q[i];
  }

  // Step 6 (Optional):
  // In general sense we need the original coordinates, so we
  // need to shift the points back (reverting Step 2)
  // But if we're only interested in getting the area/perimeter of the shape
  // We can simply return.
  if (!shift_to_zero) {
    for (int i = 0; i < m; i++) {
      q[i] += start;
    }
  }

  return m;
}

template <typename T>
__host__ __device__ __forceinline__ T polygon_area(const Point<T> (&q)[24], const int &m) {
  if (m <= 2) {
    return 0;
  }

  T area = 0;
  for (int i = 1; i < m - 1; i++) {
    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
  }

  return area / 2.0;
}

template <typename T>
__host__ __device__ __forceinline__ T rotated_boxes_intersection(const RotatedBox<T> &box1,
                                                                 const RotatedBox<T> &box2) {
  // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
  // from rotated_rect_intersection_pts
  Point<T> intersectPts[24], orderedPts[24];

  Point<T> pts1[4];
  Point<T> pts2[4];
  get_rotated_vertices<T>(box1, pts1);
  get_rotated_vertices<T>(box2, pts2);

  int num = get_intersection_points<T>(pts1, pts2, intersectPts);

  if (num <= 2) {
    return 0.0;
  }

  // Convex Hull to order the intersection points in clockwise order and find
  // the contour area.
  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
  return polygon_area<T>(orderedPts, num_convex);
}

template <typename T>
__host__ __device__ __forceinline__ T single_box_iou_rotated(T const *const box1_raw,
                                                             T const *const box2_raw) {
  // shift center to the middle point to achieve higher precision in result
  RotatedBox<T> box1, box2;
  auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
  auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
  box1.x_ctr = box1_raw[0] - center_shift_x;
  box1.y_ctr = box1_raw[1] - center_shift_y;
  box1.w = box1_raw[2];
  box1.h = box1_raw[3];
  box1.a = box1_raw[4];
  box2.x_ctr = box2_raw[0] - center_shift_x;
  box2.y_ctr = box2_raw[1] - center_shift_y;
  box2.w = box2_raw[2];
  box2.h = box2_raw[3];
  box2.a = box2_raw[4];

  const T area1 = box1.w * box1.h;
  const T area2 = box2.w * box2.h;
  if (area1 < 1e-14 || area2 < 1e-14) {
    return 1.0f;
  }

  const T intersection = rotated_boxes_intersection<T>(box1, box2);
  T baseS = 1.0;
  baseS = (area1 + area2 - intersection);
  const T iou = intersection / baseS;
  return iou;
}

/********** new NMS for only score and index array **********/
template <typename T_SCORE, typename T_BBOX, int TSIZE>
__global__ void allClassRotatedNMS_kernel(const int num, const int num_classes,
                                          const int num_preds_per_class, const int top_k,
                                          const float nms_threshold, const bool share_location,
                                          const bool isNormalized,
                                          T_BBOX *bbox_data,  // bbox_data should be float to
                                                              // preserve location information
                                          T_SCORE *beforeNMS_scores, int *beforeNMS_index_array,
                                          T_SCORE *afterNMS_scores, int *afterNMS_index_array) {
  //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE];
  extern __shared__ bool kept_bboxinfo_flag[];
  for (int i = 0; i < num; i++) {
    const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class;
    const int max_idx = offset + top_k;  // put top_k bboxes into NMS calculation
    const int bbox_idx_offset =
        share_location ? (i * num_preds_per_class) : (i * num_classes * num_preds_per_class);

    // local thread data
    int loc_bboxIndex[TSIZE];
    T_BBOX loc_bbox[TSIZE * 5];

    // initialize Bbox, Bboxinfo, kept_bboxinfo_flag
    // Eliminate shared memory RAW hazard
    __syncthreads();
#pragma unroll
    for (int t = 0; t < TSIZE; t++) {
      const int cur_idx = threadIdx.x + blockDim.x * t;
      const int item_idx = offset + cur_idx;

      if (item_idx < max_idx) {
        loc_bboxIndex[t] = beforeNMS_index_array[item_idx];

        if (loc_bboxIndex[t] >= 0)
        // if (loc_bboxIndex[t] != -1)
        {
          const int bbox_data_idx =
              share_location ? (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset)
                             : loc_bboxIndex[t];
          memcpy(&loc_bbox[t * 5], &bbox_data[bbox_data_idx * 5], 5 * sizeof(T_BBOX));
          kept_bboxinfo_flag[cur_idx] = true;
        } else {
          kept_bboxinfo_flag[cur_idx] = false;
        }
      } else {
        kept_bboxinfo_flag[cur_idx] = false;
      }
    }

    // filter out overlapped boxes with lower scores
    int ref_item_idx = offset;
    int ref_bbox_idx =
        share_location
            ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
            : beforeNMS_index_array[ref_item_idx];

    while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) {
      T_BBOX ref_bbox[5];
      memcpy(&ref_bbox[0], &bbox_data[ref_bbox_idx * 5], 5 * sizeof(T_BBOX));

      // Eliminate shared memory RAW hazard
      __syncthreads();

      for (int t = 0; t < TSIZE; t++) {
        const int cur_idx = threadIdx.x + blockDim.x * t;
        const int item_idx = offset + cur_idx;

        if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) {
          // TODO: may need to add bool normalized as argument, HERE true means
          // normalized
          if (single_box_iou_rotated(&ref_bbox[0], loc_bbox + t * 5) > nms_threshold) {
            kept_bboxinfo_flag[cur_idx] = false;
          }
        }
      }
      __syncthreads();

      do {
        ref_item_idx++;
      } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]);

      ref_bbox_idx =
          share_location
              ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
              : beforeNMS_index_array[ref_item_idx];
    }

    // store data
    for (int t = 0; t < TSIZE; t++) {
      const int cur_idx = threadIdx.x + blockDim.x * t;
      const int read_item_idx = offset + cur_idx;
      const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx;
      /*
       * If not not keeping the bbox
       * Set the score to 0
       * Set the bounding box index to -1
       */
      if (read_item_idx < max_idx) {
        afterNMS_scores[write_item_idx] =
            kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f;
        afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1;
      }
    }
  }
}

template <typename T_SCORE, typename T_BBOX>
pluginStatus_t allClassRotatedNMS_gpu(cudaStream_t stream, const int num, const int num_classes,
                                      const int num_preds_per_class, const int top_k,
                                      const float nms_threshold, const bool share_location,
                                      const bool isNormalized, void *bbox_data,
                                      void *beforeNMS_scores, void *beforeNMS_index_array,
                                      void *afterNMS_scores, void *afterNMS_index_array) {
#define P(tsize) allClassRotatedNMS_kernel<T_SCORE, T_BBOX, (tsize)>

  void (*kernel[10])(const int, const int, const int, const int, const float, const bool,
                     const bool, float *, T_SCORE *, int *, T_SCORE *, int *) = {
      P(1), P(2), P(3), P(4), P(5), P(6), P(7), P(8), P(9), P(10),
  };

  const int BS = 512;
  const int GS = num_classes;
  const int t_size = (top_k + BS - 1) / BS;

  ASSERT(t_size <= 10);
  kernel[t_size - 1]<<<GS, BS, BS * t_size * sizeof(bool), stream>>>(
      num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized,
      (T_BBOX *)bbox_data, (T_SCORE *)beforeNMS_scores, (int *)beforeNMS_index_array,
      (T_SCORE *)afterNMS_scores, (int *)afterNMS_index_array);

  CSC(cudaGetLastError(), STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// allClassNMS LAUNCH CONFIG
typedef pluginStatus_t (*rotatedNmsFunc)(cudaStream_t, const int, const int, const int, const int,
                                         const float, const bool, const bool, void *, void *,
                                         void *, void *, void *);

struct rotatedNmsLaunchConfig {
  DataType t_score;
  DataType t_bbox;
  rotatedNmsFunc function;

  rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox) : t_score(t_score), t_bbox(t_bbox) {}
  rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox, rotatedNmsFunc function)
      : t_score(t_score), t_bbox(t_bbox), function(function) {}
  bool operator==(const rotatedNmsLaunchConfig &other) {
    return t_score == other.t_score && t_bbox == other.t_bbox;
  }
};

static std::vector<rotatedNmsLaunchConfig> rotatedNmsFuncVec;

bool rotatedNmsInit() {
  rotatedNmsFuncVec.push_back(rotatedNmsLaunchConfig(DataType::kFLOAT, DataType::kFLOAT,
                                                     allClassRotatedNMS_gpu<float, float>));
  return true;
}

static bool initialized = rotatedNmsInit();

pluginStatus_t allClassRotatedNMS(cudaStream_t stream, const int num, const int num_classes,
                                  const int num_preds_per_class, const int top_k,
                                  const float nms_threshold, const bool share_location,
                                  const bool isNormalized, const DataType DT_SCORE,
                                  const DataType DT_BBOX, void *bbox_data, void *beforeNMS_scores,
                                  void *beforeNMS_index_array, void *afterNMS_scores,
                                  void *afterNMS_index_array, bool) {
  auto __cuda_arch__ = get_cuda_arch(0);  // assume there is only one arch 7.2 device
  if (__cuda_arch__ == 720 && top_k >= 1000) {
    printf("Warning: pre_top_k need to be reduced for devices with arch 7.2, got pre_top_k=%d\n",
           top_k);
  }
  rotatedNmsLaunchConfig lc(DT_SCORE, DT_BBOX);

  for (unsigned i = 0; i < rotatedNmsFuncVec.size(); ++i) {
    if (lc == rotatedNmsFuncVec[i]) {
      DEBUG_PRINTF("all class rotated nms kernel %d\n", i);
      return rotatedNmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k,
                                           nms_threshold, share_location, isNormalized, bbox_data,
                                           beforeNMS_scores, beforeNMS_index_array,
                                           afterNMS_scores, afterNMS_index_array);
    }
  }
  return STATUS_BAD_PARAM;
}
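Note (illustrative, not part of the commit): because the rotated-IoU helpers above are marked __host__ __device__, they can also be exercised from host code when compiled with nvcc, which is convenient for unit-testing the polygon clipping. A tiny sketch using the (x_ctr, y_ctr, w, h, angle_rad) layout the kernel assumes:

// Sketch: two identical 2x1 boxes rotated by 45 degrees should give IoU == 1.
float a[5] = {0.f, 0.f, 2.f, 1.f, 0.7853981634f};
float b[5] = {0.f, 0.f, 2.f, 1.f, 0.7853981634f};
float iou = single_box_iou_rotated<float>(a, b);  // expect ~1.0f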
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include "nms/batched_nms_kernel.hpp"

pluginStatus_t nmsInference(cudaStream_t stream, const int N, const int perBatchBoxesSize,
                            const int perBatchScoresSize, const bool shareLocation,
                            const int backgroundLabelId, const int numPredsPerClass,
                            const int numClasses, const int topK, const int keepTopK,
                            const float scoreThreshold, const float iouThreshold,
                            const DataType DT_BBOX, const void *locData, const DataType DT_SCORE,
                            const void *confData, void *nmsedDets, void *nmsedLabels,
                            void *nmsedIndex, void *workspace, bool isNormalized, bool confSigmoid,
                            bool clipBoxes, bool rotated) {
  const int topKVal = topK < 0 ? numPredsPerClass : topK;
  const int keepTopKVal = keepTopK < 0 ? numPredsPerClass : keepTopK;
  // locCount = batch_size * number_boxes_per_sample * 4
  const int locCount = N * perBatchBoxesSize;
  /*
   * shareLocation
   * Bounding box are shared among all classes, i.e., a bounding box could be
   * classified as any candidate class. Otherwise Bounding box are designed for
   * specific classes, i.e., a bounding box could be classified as one certain
   * class or not (binary classification).
   */
  const int numLocClasses = shareLocation ? 1 : numClasses;

  size_t bboxDataSize = detectionForwardBBoxDataSize(N, perBatchBoxesSize, DataType::kFLOAT);
  void *bboxDataRaw = workspace;
  cudaMemcpyAsync(bboxDataRaw, locData, bboxDataSize, cudaMemcpyDeviceToDevice, stream);
  pluginStatus_t status;

  /*
   * bboxDataRaw format:
   * [batch size, numPriors (per sample), numLocClasses, 4]
   */
  // float for now
  void *bboxData;
  size_t bboxPermuteSize =
      detectionForwardBBoxPermuteSize(shareLocation, N, perBatchBoxesSize, DataType::kFLOAT);
  void *bboxPermute = nextWorkspacePtr((int8_t *)bboxDataRaw, bboxDataSize);

  /*
   * After permutation, bboxData format:
   * [batch_size, numLocClasses, numPriors (per sample) (numPredsPerClass), 4]
   * This is equivalent to swapping axis
   */
  if (!shareLocation) {
    status = permuteData(stream, locCount, numLocClasses, numPredsPerClass, rotated ? 5 : 4,
                         DataType::kFLOAT, false, bboxDataRaw, bboxPermute);
    ASSERT_FAILURE(status == STATUS_SUCCESS);
    bboxData = bboxPermute;
  }
  /*
   * If shareLocation, numLocClasses = 1
   * No need to permute data on linear memory
   */
  else {
    bboxData = bboxDataRaw;
  }

  /*
   * Conf data format
   * [batch size, numPriors * param.numClasses, 1, 1]
   */
  const int numScores = N * perBatchScoresSize;
  size_t totalScoresSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
  void *scores = nextWorkspacePtr((int8_t *)bboxPermute, bboxPermuteSize);

  // need a conf_scores
  /*
   * After permutation, bboxData format:
   * [batch_size, numClasses, numPredsPerClass, 1]
   */
  status = permuteData(stream, numScores, numClasses, numPredsPerClass, 1, DataType::kFLOAT,
                       confSigmoid, confData, scores);
  ASSERT_FAILURE(status == STATUS_SUCCESS);

  size_t indicesSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
  void *indices = nextWorkspacePtr((int8_t *)scores, totalScoresSize);

  size_t postNMSScoresSize = detectionForwardPostNMSSize(N, numClasses, topKVal);
  size_t postNMSIndicesSize = detectionForwardPostNMSSize(N, numClasses, topKVal);
  void *postNMSScores = nextWorkspacePtr((int8_t *)indices, indicesSize);
  void *postNMSIndices = nextWorkspacePtr((int8_t *)postNMSScores, postNMSScoresSize);

  void *sortingWorkspace = nextWorkspacePtr((int8_t *)postNMSIndices, postNMSIndicesSize);
  // Sort the scores so that the following NMS could be applied.
  status = sortScoresPerClass(stream, N, numClasses, numPredsPerClass, backgroundLabelId,
                              scoreThreshold, DataType::kFLOAT, scores, indices, sortingWorkspace);
  ASSERT_FAILURE(status == STATUS_SUCCESS);

  // This is set to true as the input bounding boxes are of the format [ymin,
  // xmin, ymax, xmax]. The default implementation assumes [xmin, ymin, xmax,
  // ymax]
  bool flipXY = false;
  // NMS
  if (rotated) {
    status = allClassRotatedNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold,
                                shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT,
                                bboxData, scores, indices, postNMSScores, postNMSIndices, flipXY);
  } else {
    status = allClassNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold,
                         shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT, bboxData,
                         scores, indices, postNMSScores, postNMSIndices, flipXY);
  }
  ASSERT_FAILURE(status == STATUS_SUCCESS);

  // Sort the bounding boxes after NMS using scores
  status = sortScoresPerImage(stream, N, numClasses * topKVal, DataType::kFLOAT, postNMSScores,
                              postNMSIndices, scores, indices, sortingWorkspace);
  ASSERT_FAILURE(status == STATUS_SUCCESS);

  // Gather data from the sorted bounding boxes after NMS
  status = gatherNMSOutputs(stream, shareLocation, N, numPredsPerClass, numClasses, topKVal,
                            keepTopKVal, DataType::kFLOAT, DataType::kFLOAT, indices, scores,
                            bboxData, nmsedDets, nmsedLabels, nmsedIndex, clipBoxes, rotated);
  ASSERT_FAILURE(status == STATUS_SUCCESS);

  return STATUS_SUCCESS;
}
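Note (illustrative, not part of the commit): nmsInference expects the caller to hand it one linear workspace, which it then carves up with nextWorkspacePtr in the order shown above. A hedged sketch of a caller (device pointers d_boxes/d_scores/d_dets/d_labels/d_index and the shape variables are placeholders):

// Sketch: allocate one linear workspace, then run the full batched-NMS pipeline.
size_t ws = detectionInferenceWorkspaceSize(
    /*shareLocation=*/true, N, perBatchBoxesSize, perBatchScoresSize,
    numClasses, numPredsPerClass, topK, DataType::kFLOAT, DataType::kFLOAT);
void *workspace = nullptr;
cudaMalloc(&workspace, ws);
pluginStatus_t s = nmsInference(stream, N, perBatchBoxesSize, perBatchScoresSize,
                                /*shareLocation=*/true, /*backgroundLabelId=*/-1,
                                numPredsPerClass, numClasses, topK, keepTopK,
                                scoreThreshold, iouThreshold, DataType::kFLOAT, d_boxes,
                                DataType::kFLOAT, d_scores, d_dets, d_labels, d_index,
                                workspace, /*isNormalized=*/true, /*confSigmoid=*/false,
                                /*clipBoxes=*/true, /*rotated=*/false);
cudaFree(workspace);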
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu (new file, mode 100644):

// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>

#include "nms/kernel.h"
#include "trt_plugin_helper.hpp"

template <typename T_BBOX, typename T_SCORE, bool rotated, unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
    void gatherNMSOutputs_kernel(const bool shareLocation, const int numImages,
                                 const int numPredsPerClass, const int numClasses, const int topK,
                                 const int keepTopK, const int *indices, const T_SCORE *scores,
                                 const T_BBOX *bboxData, T_BBOX *nmsedDets, int *nmsedLabels,
                                 int *nmsedIndex, bool clipBoxes) {
  if (keepTopK > topK) return;
  for (int i = blockIdx.x * nthds_per_cta + threadIdx.x; i < numImages * keepTopK;
       i += gridDim.x * nthds_per_cta) {
    const int imgId = i / keepTopK;
    const int detId = i % keepTopK;
    const int offset = imgId * numClasses * topK;
    const int index = indices[offset + detId];
    const T_SCORE score = scores[offset + detId];
    if (index == -1) {
      nmsedLabels[i] = -1;
      if (nmsedIndex != nullptr) {
        nmsedIndex[i] = -1;
      }
      if (rotated) {
        nmsedDets[i * 6] = 0;
        nmsedDets[i * 6 + 1] = 0;
        nmsedDets[i * 6 + 2] = 0;
        nmsedDets[i * 6 + 3] = 0;
        nmsedDets[i * 6 + 4] = 0;
        nmsedDets[i * 6 + 5] = 0;
      } else {
        nmsedDets[i * 5] = 0;
        nmsedDets[i * 5 + 1] = 0;
        nmsedDets[i * 5 + 2] = 0;
        nmsedDets[i * 5 + 3] = 0;
        nmsedDets[i * 5 + 4] = 0;
      }
    } else {
      const int bboxOffset =
          imgId * (shareLocation ? numPredsPerClass : (numClasses * numPredsPerClass));
      nmsedLabels[i] = (index % (numClasses * numPredsPerClass)) / numPredsPerClass;  // label
      if (rotated) {
        const int bboxId =
            ((shareLocation ? (index % numPredsPerClass)
                            : index % (numClasses * numPredsPerClass)) +
             bboxOffset) *
            5;
        if (nmsedIndex != nullptr) {
          nmsedIndex[i] = bboxId / 5 - bboxOffset;
        }
        // clipped bbox xmin
        nmsedDets[i * 6] =
            clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId];
        // clipped bbox ymin
        nmsedDets[i * 6 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 1];
        // clipped bbox xmax
        nmsedDets[i * 6 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 2];
        // clipped bbox ymax
        nmsedDets[i * 6 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 3];
        // clipped bbox angle
        nmsedDets[i * 6 + 4] = clipBoxes ? max(min(bboxData[bboxId + 4], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 4];
        nmsedDets[i * 6 + 5] = score;
      } else {
        const int bboxId =
            ((shareLocation ? (index % numPredsPerClass)
                            : index % (numClasses * numPredsPerClass)) +
             bboxOffset) *
            4;
        if (nmsedIndex != nullptr) {
          nmsedIndex[i] = bboxId / 4 - bboxOffset;
        }
        // clipped bbox xmin
        nmsedDets[i * 5] =
            clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId];
        // clipped bbox ymin
        nmsedDets[i * 5 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 1];
        // clipped bbox xmax
        nmsedDets[i * 5 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 2];
        // clipped bbox ymax
        nmsedDets[i * 5 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.))
                                         : bboxData[bboxId + 3];
        nmsedDets[i * 5 + 4] = score;
      }
    }
  }
}

template <typename T_BBOX, typename T_SCORE, bool rotated>
pluginStatus_t gatherNMSOutputs_gpu(cudaStream_t stream, const bool shareLocation,
                                    const int numImages, const int numPredsPerClass,
                                    const int numClasses, const int topK, const int keepTopK,
                                    const void *indices, const void *scores, const void *bboxData,
                                    void *nmsedDets, void *nmsedLabels, void *nmsedIndex,
                                    bool clipBoxes) {
  const int BS = 32;
  const int GS = 32;
  gatherNMSOutputs_kernel<T_BBOX, T_SCORE, rotated, BS><<<GS, BS, 0, stream>>>(
      shareLocation, numImages, numPredsPerClass, numClasses, topK, keepTopK, (int *)indices,
      (T_SCORE *)scores, (T_BBOX *)bboxData, (T_BBOX *)nmsedDets, (int *)nmsedLabels,
      (int *)nmsedIndex, clipBoxes);

  CSC(cudaGetLastError(), STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// gatherNMSOutputs LAUNCH CONFIG {{{
typedef pluginStatus_t (*nmsOutFunc)(cudaStream_t, const bool, const int, const int, const int,
                                     const int, const int, const void *, const void *,
                                     const void *, void *, void *, void *, bool);
struct nmsOutLaunchConfig {
  DataType t_bbox;
  DataType t_score;
  bool rotated;
  nmsOutFunc function;

  nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated)
      : t_bbox(t_bbox), t_score(t_score), rotated(rotated) {}
  nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated, nmsOutFunc function)
      : t_bbox(t_bbox), t_score(t_score), rotated(rotated), function(function) {}
  bool operator==(const nmsOutLaunchConfig &other) {
    return t_bbox == other.t_bbox && t_score == other.t_score && rotated == other.rotated;
  }
};

using nvinfer1::DataType;

static std::vector<nmsOutLaunchConfig> nmsOutFuncVec;

bool nmsOutputInit() {
  nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, false,
                                             gatherNMSOutputs_gpu<float, float, false>));
  nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, true,
                                             gatherNMSOutputs_gpu<float, float, true>));
  return true;
}

static bool initialized = nmsOutputInit();

pluginStatus_t gatherNMSOutputs(cudaStream_t stream, const bool shareLocation, const int numImages,
                                const int numPredsPerClass, const int numClasses, const int topK,
                                const int keepTopK, const DataType DT_BBOX,
                                const DataType DT_SCORE, const void *indices, const void *scores,
                                const void *bboxData, void *nmsedDets, void *nmsedLabels,
                                void *nmsedIndex, bool clipBoxes, bool rotated) {
  nmsOutLaunchConfig lc = nmsOutLaunchConfig(DT_BBOX, DT_SCORE, rotated);
  for (unsigned i = 0; i < nmsOutFuncVec.size(); ++i) {
    if (lc == nmsOutFuncVec[i]) {
      DEBUG_PRINTF("gatherNMSOutputs kernel %d\n", i);
      return nmsOutFuncVec[i].function(stream, shareLocation, numImages, numPredsPerClass,
                                       numClasses, topK, keepTopK, indices, scores, bboxData,
                                       nmsedDets, nmsedLabels, nmsedIndex, clipBoxes);
    }
  }
  return STATUS_BAD_PARAM;
}
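Note (illustrative, not part of the commit): the label/box decoding in gatherNMSOutputs_kernel is plain modular arithmetic. For example, with numClasses = 80 and numPredsPerClass = 1000, a flat index of 163500 yields label (163500 % 80000) / 1000 = 3, and with shareLocation the per-image box slot is 163500 % 1000 = 500.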
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu
0 → 100644
View file @
546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <stdint.h>
#include <cub/cub.cuh>
#include "cublas_v2.h"
#include "nms/kernel.h"
#include "trt_plugin_helper.hpp"
#define CUDA_MEM_ALIGN 256
// return cuda arch
size_t get_cuda_arch(int devID) {
  int computeMode = -1, major = 0, minor = 0;
  CUASSERT(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
  CUASSERT(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
  CUASSERT(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
  return major * 100 + minor * 10;
}

// ALIGNPTR
int8_t *alignPtr(int8_t *ptr, uintptr_t to) {
  uintptr_t addr = (uintptr_t)ptr;
  if (addr % to) {
    addr += to - addr % to;
  }
  return (int8_t *)addr;
}

// NEXTWORKSPACEPTR
int8_t *nextWorkspacePtr(int8_t *ptr, uintptr_t previousWorkspaceSize) {
  uintptr_t addr = (uintptr_t)ptr;
  addr += previousWorkspaceSize;
  return alignPtr((int8_t *)addr, CUDA_MEM_ALIGN);
}

// CALCULATE TOTAL WORKSPACE SIZE
size_t calculateTotalWorkspaceSize(size_t *workspaces, int count) {
  size_t total = 0;
  for (int i = 0; i < count; i++) {
    total += workspaces[i];
    if (workspaces[i] % CUDA_MEM_ALIGN) {
      total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN);
    }
  }
  return total;
}

using nvinfer1::DataType;

template <unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
    void setUniformOffsets_kernel(const int num_segments, const int offset, int *d_offsets) {
  const int idx = blockIdx.x * nthds_per_cta + threadIdx.x;
  if (idx <= num_segments) d_offsets[idx] = idx * offset;
}

void setUniformOffsets(cudaStream_t stream, const int num_segments, const int offset,
                       int *d_offsets) {
  const int BS = 32;
  const int GS = (num_segments + 1 + BS - 1) / BS;
  setUniformOffsets_kernel<BS><<<GS, BS, 0, stream>>>(num_segments, offset, d_offsets);
}

size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX) {
  if (DT_BBOX == DataType::kFLOAT) {
    return N * C1 * sizeof(float);
  }
  printf("Only FP32 type bounding boxes are supported.\n");
  return (size_t)-1;
}

size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX) {
  if (DT_BBOX == DataType::kFLOAT) {
    return shareLocation ? 0 : N * C1 * sizeof(float);
  }
  printf("Only FP32 type bounding boxes are supported.\n");
  return (size_t)-1;
}

size_t detectionForwardPreNMSSize(int N, int C2) {
  ASSERT(sizeof(float) == sizeof(int));
  return N * C2 * sizeof(float);
}

size_t detectionForwardPostNMSSize(int N, int numClasses, int topK) {
  ASSERT(sizeof(float) == sizeof(int));
  return N * numClasses * topK * sizeof(float);
}

size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses,
                                       int numPredsPerClass, int topK, DataType DT_BBOX,
                                       DataType DT_SCORE) {
  size_t wss[7];
  wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX);
  wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX);
  wss[2] = detectionForwardPreNMSSize(N, C2);
  wss[3] = detectionForwardPreNMSSize(N, C2);
  wss[4] = detectionForwardPostNMSSize(N, numClasses, topK);
  wss[5] = detectionForwardPostNMSSize(N, numClasses, topK);
  wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE),
                    sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE));
  return calculateTotalWorkspaceSize(wss, 7);
}
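kernel.cu rounds every sub-buffer of the detection workspace up to CUDA_MEM_ALIGN (256 bytes), so each chunk handed out by nextWorkspacePtr starts on an aligned address. A host-only sketch of the same arithmetic; align_up and total_workspace are illustrative names.

// Host-only sketch of the 256-byte workspace padding from kernel.cu.
#include <cassert>
#include <cstdint>

constexpr uintptr_t kAlign = 256;  // CUDA_MEM_ALIGN

uintptr_t align_up(uintptr_t addr) {
  return (addr % kAlign) ? addr + kAlign - addr % kAlign : addr;
}

size_t total_workspace(const size_t *ws, int count) {
  size_t total = 0;
  for (int i = 0; i < count; ++i) {
    total += ws[i];
    if (ws[i] % kAlign) total += kAlign - ws[i] % kAlign;  // pad each chunk to the boundary
  }
  return total;
}

int main() {
  size_t ws[3] = {100, 256, 300};
  assert(align_up(100) == 256);
  assert(total_workspace(ws, 3) == 256 + 256 + 512);  // each chunk rounded up separately
  return 0;
}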
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu
0 → 100644
View file @ 546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>
#include "nms/kernel.h"
template <typename Dtype, unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
    void permuteData_kernel(const int nthreads, const int num_classes, const int num_data,
                            const int num_dim, bool confSigmoid, const Dtype *data,
                            Dtype *new_data) {
  // data format: [batch_size, num_data, num_classes, num_dim]
  for (int index = blockIdx.x * nthds_per_cta + threadIdx.x; index < nthreads;
       index += nthds_per_cta * gridDim.x) {
    const int i = index % num_dim;
    const int c = (index / num_dim) % num_classes;
    const int d = (index / num_dim / num_classes) % num_data;
    const int n = index / num_dim / num_classes / num_data;
    const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i;
    float result = data[index];
    if (confSigmoid) result = exp(result) / (1 + exp(result));
    new_data[new_index] = result;
  }
  // new data format: [batch_size, num_classes, num_data, num_dim]
}

template <typename Dtype>
pluginStatus_t permuteData_gpu(cudaStream_t stream, const int nthreads, const int num_classes,
                               const int num_data, const int num_dim, bool confSigmoid,
                               const void *data, void *new_data) {
  const int BS = 512;
  const int GS = (nthreads + BS - 1) / BS;
  permuteData_kernel<Dtype, BS><<<GS, BS, 0, stream>>>(nthreads, num_classes, num_data, num_dim,
                                                       confSigmoid, (const Dtype *)data,
                                                       (Dtype *)new_data);
  CSC(cudaGetLastError(), STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// permuteData LAUNCH CONFIG
typedef pluginStatus_t (*pdFunc)(cudaStream_t, const int, const int, const int, const int, bool,
                                 const void *, void *);

struct pdLaunchConfig {
  DataType t_data;
  pdFunc function;
  pdLaunchConfig(DataType t_data) : t_data(t_data) {}
  pdLaunchConfig(DataType t_data, pdFunc function) : t_data(t_data), function(function) {}
  bool operator==(const pdLaunchConfig &other) { return t_data == other.t_data; }
};

static std::vector<pdLaunchConfig> pdFuncVec;

bool permuteDataInit() {
  pdFuncVec.push_back(pdLaunchConfig(DataType::kFLOAT, permuteData_gpu<float>));
  return true;
}

static bool initialized = permuteDataInit();

pluginStatus_t permuteData(cudaStream_t stream, const int nthreads, const int num_classes,
                           const int num_data, const int num_dim, const DataType DT_DATA,
                           bool confSigmoid, const void *data, void *new_data) {
  pdLaunchConfig lc = pdLaunchConfig(DT_DATA);
  for (unsigned i = 0; i < pdFuncVec.size(); ++i) {
    if (lc == pdFuncVec[i]) {
      DEBUG_PRINTF("permuteData kernel %d\n", i);
      return pdFuncVec[i].function(stream, nthreads, num_classes, num_data, num_dim, confSigmoid,
                                   data, new_data);
    }
  }
  return STATUS_BAD_PARAM;
}
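permuteData_kernel converts a [batch_size, num_data, num_classes, num_dim] layout into [batch_size, num_classes, num_data, num_dim] by decomposing each flat index and recomposing it with the two middle axes swapped. The same index math can be verified on the host; this sketch is illustrative only.

// Host-only check of the index remapping in permuteData_kernel.
#include <cassert>
#include <vector>

int main() {
  const int N = 2, D = 3, C = 4, K = 5;  // batch, num_data, num_classes, num_dim
  std::vector<float> src(N * D * C * K), dst(src.size());
  for (size_t v = 0; v < src.size(); ++v) src[v] = float(v);
  for (int index = 0; index < int(src.size()); ++index) {
    const int i = index % K;
    const int c = (index / K) % C;
    const int d = (index / K / C) % D;
    const int n = index / K / C / D;
    const int new_index = ((n * C + c) * D + d) * K + i;  // same formula as the kernel
    dst[new_index] = src[index];
  }
  // element (n=1, d=2, c=3, i=4) must move to (n=1, c=3, d=2, i=4)
  assert(dst[((1 * C + 3) * D + 2) * K + 4] == src[((1 * D + 2) * C + 3) * K + 4]);
  return 0;
}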
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu
0 → 100644
View file @ 546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>
#include "cub/cub.cuh"
#include "nms/cub_helper.h"
#include "nms/kernel.h"
#include "trt_plugin_helper.hpp"
template <typename T_SCORE, unsigned nthds_per_cta>
__launch_bounds__(nthds_per_cta) __global__
    void prepareSortData(const int num, const int num_classes, const int num_preds_per_class,
                         const int background_label_id, const float confidence_threshold,
                         T_SCORE *conf_scores_gpu, T_SCORE *temp_scores, int *temp_idx,
                         int *d_offsets) {
  // Prepare scores data for sort
  const int cur_idx = blockIdx.x * nthds_per_cta + threadIdx.x;
  const int numPredsPerBatch = num_classes * num_preds_per_class;
  if (cur_idx < numPredsPerBatch) {
    const int class_idx = cur_idx / num_preds_per_class;
    for (int i = 0; i < num; i++) {
      const int targetIdx = i * numPredsPerBatch + cur_idx;
      const T_SCORE score = conf_scores_gpu[targetIdx];

      // "Clear" background labeled score and index
      // Because we do not care about background
      if (class_idx == background_label_id) {
        // Set scores to 0
        // Set label = -1
        temp_scores[targetIdx] = 0.0f;
        temp_idx[targetIdx] = -1;
        conf_scores_gpu[targetIdx] = 0.0f;
      }
      // "Clear" scores lower than threshold
      else {
        if (score > confidence_threshold) {
          temp_scores[targetIdx] = score;
          temp_idx[targetIdx] = cur_idx + i * numPredsPerBatch;
        } else {
          // Set scores to 0
          // Set label = -1
          temp_scores[targetIdx] = 0.0f;
          temp_idx[targetIdx] = -1;
          conf_scores_gpu[targetIdx] = 0.0f;
          // TODO: HERE writing memory too many times
        }
      }

      if ((cur_idx % num_preds_per_class) == 0) {
        const int offset_ct = i * num_classes + cur_idx / num_preds_per_class;
        d_offsets[offset_ct] = offset_ct * num_preds_per_class;
        // set the last element in d_offset
        if (blockIdx.x == 0 && threadIdx.x == 0)
          d_offsets[num * num_classes] = num * numPredsPerBatch;
      }
    }
  }
}

template <typename T_SCORE>
pluginStatus_t sortScoresPerClass_gpu(cudaStream_t stream, const int num, const int num_classes,
                                      const int num_preds_per_class,
                                      const int background_label_id,
                                      const float confidence_threshold, void *conf_scores_gpu,
                                      void *index_array_gpu, void *workspace) {
  const int num_segments = num * num_classes;
  void *temp_scores = workspace;
  const int arrayLen = num * num_classes * num_preds_per_class;
  void *temp_idx = nextWorkspacePtr((int8_t *)temp_scores, arrayLen * sizeof(T_SCORE));
  void *d_offsets = nextWorkspacePtr((int8_t *)temp_idx, arrayLen * sizeof(int));
  size_t cubOffsetSize = (num_segments + 1) * sizeof(int);
  void *cubWorkspace = nextWorkspacePtr((int8_t *)d_offsets, cubOffsetSize);

  const int BS = 512;
  const int GS = (num_classes * num_preds_per_class + BS - 1) / BS;
  prepareSortData<T_SCORE, BS><<<GS, BS, 0, stream>>>(
      num, num_classes, num_preds_per_class, background_label_id, confidence_threshold,
      (T_SCORE *)conf_scores_gpu, (T_SCORE *)temp_scores, (int *)temp_idx, (int *)d_offsets);

  size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_segments);
  cub::DeviceSegmentedRadixSort::SortPairsDescending(
      cubWorkspace, temp_storage_bytes, (const T_SCORE *)(temp_scores),
      (T_SCORE *)(conf_scores_gpu), (const int *)(temp_idx), (int *)(index_array_gpu), arrayLen,
      num_segments, (const int *)d_offsets, (const int *)d_offsets + 1, 0, sizeof(T_SCORE) * 8,
      stream);
  CSC(cudaGetLastError(), STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// sortScoresPerClass LAUNCH CONFIG
typedef pluginStatus_t (*sspcFunc)(cudaStream_t, const int, const int, const int, const int,
                                   const float, void *, void *, void *);

struct sspcLaunchConfig {
  DataType t_score;
  sspcFunc function;
  sspcLaunchConfig(DataType t_score) : t_score(t_score) {}
  sspcLaunchConfig(DataType t_score, sspcFunc function) : t_score(t_score), function(function) {}
  bool operator==(const sspcLaunchConfig &other) { return t_score == other.t_score; }
};

static std::vector<sspcLaunchConfig> sspcFuncVec;

bool sspcInit() {
  sspcFuncVec.push_back(sspcLaunchConfig(DataType::kFLOAT, sortScoresPerClass_gpu<float>));
  return true;
}

static bool initialized = sspcInit();

pluginStatus_t sortScoresPerClass(cudaStream_t stream, const int num, const int num_classes,
                                  const int num_preds_per_class, const int background_label_id,
                                  const float confidence_threshold, const DataType DT_SCORE,
                                  void *conf_scores_gpu, void *index_array_gpu, void *workspace) {
  sspcLaunchConfig lc = sspcLaunchConfig(DT_SCORE);
  for (unsigned i = 0; i < sspcFuncVec.size(); ++i) {
    if (lc == sspcFuncVec[i]) {
      DEBUG_PRINTF("sortScoresPerClass kernel %d\n", i);
      return sspcFuncVec[i].function(stream, num, num_classes, num_preds_per_class,
                                     background_label_id, confidence_threshold, conf_scores_gpu,
                                     index_array_gpu, workspace);
    }
  }
  return STATUS_BAD_PARAM;
}

size_t sortScoresPerClassWorkspaceSize(const int num, const int num_classes,
                                       const int num_preds_per_class, const DataType DT_CONF) {
  size_t wss[4];
  const int arrayLen = num * num_classes * num_preds_per_class;
  wss[0] = arrayLen * mmdeploy::getElementSize(DT_CONF);  // temp scores
  wss[1] = arrayLen * sizeof(int);                        // temp indices
  wss[2] = (num * num_classes + 1) * sizeof(int);         // offsets
  if (DT_CONF == DataType::kFLOAT) {
    wss[3] = cubSortPairsWorkspaceSize<float, int>(arrayLen, num * num_classes);  // cub workspace
  } else {
    printf("SCORE type not supported\n");
    return (size_t)-1;
  }
  return calculateTotalWorkspaceSize(wss, 4);
}
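sortScoresPerClass treats each (image, class) pair as one segment of num_preds_per_class scores, so cub::DeviceSegmentedRadixSort receives an offsets array with num * num_classes + 1 entries where segment s spans [offsets[s], offsets[s + 1]). A host-only sketch of that offsets layout, illustrative only.

// Host-only sketch of the segment offsets consumed by the segmented radix sort.
#include <cassert>
#include <vector>

int main() {
  const int num = 2, num_classes = 3, num_preds_per_class = 4;
  const int num_segments = num * num_classes;
  std::vector<int> offsets(num_segments + 1);
  for (int s = 0; s <= num_segments; ++s) offsets[s] = s * num_preds_per_class;
  // the final entry closes the last segment, matching prepareSortData's last-element write
  assert(offsets.back() == num * num_classes * num_preds_per_class);
  return 0;
}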
csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu
0 → 100644
View file @ 546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include <vector>
#include "cub/cub.cuh"
#include "nms/cub_helper.h"
#include "nms/kernel.h"
template <typename T_SCORE>
pluginStatus_t sortScoresPerImage_gpu(cudaStream_t stream, const int num_images,
                                      const int num_items_per_image, void *unsorted_scores,
                                      void *unsorted_bbox_indices, void *sorted_scores,
                                      void *sorted_bbox_indices, void *workspace) {
  void *d_offsets = workspace;
  void *cubWorkspace = nextWorkspacePtr((int8_t *)d_offsets, (num_images + 1) * sizeof(int));

  setUniformOffsets(stream, num_images, num_items_per_image, (int *)d_offsets);

  const int arrayLen = num_images * num_items_per_image;
  size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_images);
  cub::DeviceSegmentedRadixSort::SortPairsDescending(
      cubWorkspace, temp_storage_bytes, (const T_SCORE *)(unsorted_scores),
      (T_SCORE *)(sorted_scores), (const int *)(unsorted_bbox_indices),
      (int *)(sorted_bbox_indices), arrayLen, num_images, (const int *)d_offsets,
      (const int *)d_offsets + 1, 0, sizeof(T_SCORE) * 8, stream);
  CSC(cudaGetLastError(), STATUS_FAILURE);
  return STATUS_SUCCESS;
}

// sortScoresPerImage LAUNCH CONFIG
typedef pluginStatus_t (*sspiFunc)(cudaStream_t, const int, const int, void *, void *, void *,
                                   void *, void *);

struct sspiLaunchConfig {
  DataType t_score;
  sspiFunc function;
  sspiLaunchConfig(DataType t_score) : t_score(t_score) {}
  sspiLaunchConfig(DataType t_score, sspiFunc function) : t_score(t_score), function(function) {}
  bool operator==(const sspiLaunchConfig &other) { return t_score == other.t_score; }
};

static std::vector<sspiLaunchConfig> sspiFuncVec;

bool sspiInit() {
  sspiFuncVec.push_back(sspiLaunchConfig(DataType::kFLOAT, sortScoresPerImage_gpu<float>));
  return true;
}

static bool initialized = sspiInit();

pluginStatus_t sortScoresPerImage(cudaStream_t stream, const int num_images,
                                  const int num_items_per_image, const DataType DT_SCORE,
                                  void *unsorted_scores, void *unsorted_bbox_indices,
                                  void *sorted_scores, void *sorted_bbox_indices,
                                  void *workspace) {
  sspiLaunchConfig lc = sspiLaunchConfig(DT_SCORE);
  for (unsigned i = 0; i < sspiFuncVec.size(); ++i) {
    if (lc == sspiFuncVec[i]) {
      DEBUG_PRINTF("sortScoresPerImage kernel %d\n", i);
      return sspiFuncVec[i].function(stream, num_images, num_items_per_image, unsorted_scores,
                                     unsorted_bbox_indices, sorted_scores, sorted_bbox_indices,
                                     workspace);
    }
  }
  return STATUS_BAD_PARAM;
}

size_t sortScoresPerImageWorkspaceSize(const int num_images, const int num_items_per_image,
                                       const DataType DT_SCORE) {
  const int arrayLen = num_images * num_items_per_image;
  size_t wss[2];
  wss[0] = (num_images + 1) * sizeof(int);  // offsets
  if (DT_SCORE == DataType::kFLOAT) {
    wss[1] = cubSortPairsWorkspaceSize<float, int>(arrayLen, num_images);  // cub workspace
  } else {
    printf("SCORE type not supported.\n");
    return (size_t)-1;
  }
  return calculateTotalWorkspaceSize(wss, 2);
}
csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
0 → 100644
View file @ 546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#include "common_cuda_helper.hpp"
#include "trt_plugin_helper.hpp"
using mmdeploy::TensorDesc;

template <class scalar_t>
__global__ void copy_permute_kernel(scalar_t *__restrict__ dst, const scalar_t *__restrict__ src,
                                    int n, TensorDesc ts_src_stride, TensorDesc ts_dst_stride,
                                    TensorDesc ts_permute) {
  const int src_dim = ts_src_stride.dim;
  const auto src_stride = ts_src_stride.stride;
  const auto dst_stride = ts_dst_stride.stride;
  const auto permute = ts_permute.shape;
  CUDA_1D_KERNEL_LOOP(index, n) {
    size_t dst_index = index;
    size_t src_index = 0;
    for (int i = 0; i < src_dim; ++i) {
      int dim_index = dst_index / dst_stride[i];
      dst_index = dst_index % dst_stride[i];
      src_index += dim_index * src_stride[permute[i]];
    }
    dst[index] = src[src_index];
  }
}

template <class scalar_t>
void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size, int *permute, int src_dim,
                   cudaStream_t stream) {
  size_t copy_size = 1;
  TensorDesc ts_permute;
  memcpy(&(ts_permute.shape[0]), permute, src_dim * sizeof(int));

  TensorDesc ts_src_stride;
  TensorDesc ts_dst_stride;
  ts_src_stride.dim = src_dim;
  ts_dst_stride.dim = src_dim;
  int *src_stride = &(ts_src_stride.stride[0]);
  int *dst_stride = &(ts_dst_stride.stride[0]);
  int *dst_size = &(ts_dst_stride.shape[0]);
  src_stride[src_dim - 1] = 1;
  dst_stride[src_dim - 1] = 1;

  for (int i = src_dim - 1; i >= 0; --i) {
    dst_size[i] = src_size[permute[i]];
    if (i < src_dim - 1) {
      src_stride[i] = src_stride[i + 1] * src_size[i + 1];
    }
  }

  for (int i = src_dim - 1; i >= 0; --i) {
    copy_size *= dst_size[i];
    if (i < src_dim - 1) {
      dst_stride[i] = dst_stride[i + 1] * dst_size[i + 1];
    }
  }

  copy_permute_kernel<scalar_t><<<GET_BLOCKS(copy_size), THREADS_PER_BLOCK, 0, stream>>>(
      dst, src, copy_size, ts_src_stride, ts_dst_stride, ts_permute);
}

template void memcpyPermute<float>(float *dst, const float *src, int *src_size, int *permute,
                                   int src_dim, cudaStream_t stream);
template void memcpyPermute<half>(half *dst, const half *src, int *src_size, int *permute,
                                  int src_dim, cudaStream_t stream);

cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t *cudnn_dtype) {
  switch (trt_dtype) {
    case nvinfer1::DataType::kFLOAT:
      *cudnn_dtype = CUDNN_DATA_FLOAT;
      break;
    case nvinfer1::DataType::kHALF:
      *cudnn_dtype = CUDNN_DATA_HALF;
      break;
    default:
      return CUDNN_STATUS_BAD_PARAM;
  }
  return CUDNN_STATUS_SUCCESS;
}

template <>
cublasStatus_t cublasGemmWrap<float>(cublasHandle_t handle, cublasOperation_t transa,
                                     cublasOperation_t transb, int m, int n, int k,
                                     const float *alpha, const float *A, int lda, const float *B,
                                     int ldb, const float *beta, float *C, int ldc) {
  return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}

template <>
cublasStatus_t cublasGemmWrap<half>(cublasHandle_t handle, cublasOperation_t transa,
                                    cublasOperation_t transb, int m, int n, int k,
                                    const half *alpha, const half *A, int lda, const half *B,
                                    int ldb, const half *beta, half *C, int ldc) {
  return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
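cublasGemmWrap selects the cuBLAS entry point at compile time through explicit template specialization: the primary template is only declared, and each defined specialization forwards to the type-specific backend (cublasSgemm for float, cublasHgemm for half). A host-only sketch of the pattern with printf stand-ins for the cuBLAS calls; gemm_wrap is an illustrative name.

// Host-only sketch of dispatch-by-explicit-specialization, as in cublasGemmWrap.
#include <cstdio>

template <typename T>
int gemm_wrap(int m, int n, int k);  // primary template: declared, never defined

template <>
int gemm_wrap<float>(int m, int n, int k) {
  std::printf("float gemm %dx%dx%d\n", m, n, k);  // the real code calls cublasSgemm here
  return 0;
}

template <>
int gemm_wrap<double>(int m, int n, int k) {
  std::printf("double gemm %dx%dx%d\n", m, n, k);  // the real code specializes half instead
  return 0;
}

int main() { return gemm_wrap<float>(4, 4, 4); }  // unsupported types fail to link by design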
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp
0 → 100644
View file @ 546b4279
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_deform_conv.hpp"
#include <assert.h>
#include <chrono>
#include "trt_deform_conv_kernel.hpp"
#include "trt_serialize.hpp"
using namespace nvinfer1;

namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVDeformConv2d"};
}  // namespace

DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string &name,
                                                         const nvinfer1::Dims stride,
                                                         const nvinfer1::Dims padding,
                                                         const nvinfer1::Dims dilation,
                                                         const int deformableGroup,
                                                         const int group)
    : TRTPluginBase(name),
      mStride(stride),
      mPadding(padding),
      mDilation(dilation),
      mDeformableGroup(deformableGroup),
      mGroup(group) {}

DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string name, const void *data,
                                                         size_t length)
    : TRTPluginBase(name) {
  deserialize_value(&data, &length, &mStride);
  deserialize_value(&data, &length, &mPadding);
  deserialize_value(&data, &length, &mDilation);
  deserialize_value(&data, &length, &mDeformableGroup);
  deserialize_value(&data, &length, &mGroup);
}

DeformableConvPluginDynamic::~DeformableConvPluginDynamic() {}

nvinfer1::IPluginV2DynamicExt *DeformableConvPluginDynamic::clone() const TRT_NOEXCEPT {
  DeformableConvPluginDynamic *plugin = new DeformableConvPluginDynamic(
      mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::DimsExprs DeformableConvPluginDynamic::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
  // input[0] == input
  // input[1] == offset
  // input[2] == weight
  nvinfer1::DimsExprs ret;
  ret.nbDims = 4;
  ret.d[0] = inputs[0].d[0];
  ret.d[1] = inputs[2].d[0];
  ret.d[2] = inputs[1].d[2];
  ret.d[3] = inputs[1].d[3];
  return ret;
}

bool DeformableConvPluginDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT {
  if (pos == 0) {
    return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT ||
             ioDesc[pos].type == nvinfer1::DataType::kHALF) &&
            ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
  } else {
    return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
  }
}

void DeformableConvPluginDynamic::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs,
                                                  int nbInputs,
                                                  const nvinfer1::DynamicPluginTensorDesc *outputs,
                                                  int nbOutputs) TRT_NOEXCEPT {}

size_t DeformableConvPluginDynamic::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
                                                     int nbInputs,
                                                     const nvinfer1::PluginTensorDesc *outputs,
                                                     int nbOutputs) const TRT_NOEXCEPT {
  int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type);

  int batch_size = inputs[0].dims.d[0];
  int nInputPlane = inputs[0].dims.d[1];
  int inputHeight = inputs[0].dims.d[2];
  int inputWidth = inputs[0].dims.d[3];

  int nOutputPlane = outputs[0].dims.d[1];
  int outputHeight = outputs[0].dims.d[2];
  int outputWidth = outputs[0].dims.d[3];

  int kW = inputs[2].dims.d[2];
  int kH = inputs[2].dims.d[3];
  int im2col_step = std::min(32, batch_size);

  size_t col_size = mmdeploy::getAlignedSize(nInputPlane * kW * kH * im2col_step * outputHeight *
                                             outputWidth * sizeof_dtype);

  size_t out_size = 0;
  if (im2col_step != 1)
    out_size = mmdeploy::getAlignedSize(batch_size * nOutputPlane * outputHeight * outputWidth *
                                        sizeof_dtype);

  return col_size + out_size;
}

int DeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
                                         const nvinfer1::PluginTensorDesc *outputDesc,
                                         const void *const *inputs, void *const *outputs,
                                         void *workSpace, cudaStream_t stream) TRT_NOEXCEPT {
  int batch = inputDesc[0].dims.d[0];
  int channels = inputDesc[0].dims.d[1];
  int height = inputDesc[0].dims.d[2];
  int width = inputDesc[0].dims.d[3];
  int channels_out = outputDesc[0].dims.d[1];
  int kernel_h = inputDesc[2].dims.d[2];
  int kernel_w = inputDesc[2].dims.d[3];

  const void *x = inputs[0];
  const void *offset = inputs[1];
  const void *weight = inputs[2];
  void *output = outputs[0];
  int im2col_step = std::min(batch, 32);

  auto data_type = inputDesc[0].type;
  switch (data_type) {
    case nvinfer1::DataType::kFLOAT:
      deform_conv<float>((float *)x, (float *)weight, (float *)offset, (float *)output, workSpace,
                         batch, channels, height, width, channels_out, kernel_w, kernel_h,
                         mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0],
                         mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle,
                         stream);
      break;
    case nvinfer1::DataType::kHALF:
      deform_conv<half>((half *)x, (half *)weight, (half *)offset, (half *)output, workSpace,
                        batch, channels, height, width, channels_out, kernel_w, kernel_h,
                        mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0],
                        mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle,
                        stream);
      break;
    default:
      return 1;
  }

  return 0;
}

nvinfer1::DataType DeformableConvPluginDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT {
  return inputTypes[0];
}

// IPluginV2 Methods
const char *DeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }

const char *DeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}

int DeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; }

size_t DeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT {
  return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) +
         serialized_size(mDeformableGroup) + serialized_size(mGroup);
}

void DeformableConvPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT {
  serialize_value(&buffer, mStride);
  serialize_value(&buffer, mPadding);
  serialize_value(&buffer, mDilation);
  serialize_value(&buffer, mDeformableGroup);
  serialize_value(&buffer, mGroup);
}

void DeformableConvPluginDynamic::attachToContext(
    cudnnContext *cudnnContext, cublasContext *cublasContext,
    nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT {
  m_cublas_handle = cublasContext;
}

void DeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {}

////////////////////// creator /////////////////////////////

DeformableConvPluginDynamicCreator::DeformableConvPluginDynamicCreator() {
  mPluginAttributes.clear();
  mPluginAttributes.emplace_back(nvinfer1::PluginField("stride"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("padding"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("groups"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups"));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *DeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT {
  return PLUGIN_NAME;
}

const char *DeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}

nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
  nvinfer1::Dims stride{2, {1, 1}};
  nvinfer1::Dims padding{2, {0, 0}};
  nvinfer1::Dims dilation{2, {1, 1}};
  int deformableGroup = 1;
  int group = 1;

  for (int i = 0; i < fc->nbFields; i++) {
    if (fc->fields[i].data == nullptr) {
      continue;
    }
    std::string field_name(fc->fields[i].name);

    if (field_name.compare("deform_groups") == 0) {
      deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("groups") == 0) {
      group = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("stride") == 0) {
      stride.nbDims = 2;
      stride.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      stride.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
    }

    if (field_name.compare("padding") == 0) {
      padding.nbDims = 2;
      padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
    }

    if (field_name.compare("dilation") == 0) {
      dilation.nbDims = 2;
      dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
    }
  }

  DeformableConvPluginDynamic *plugin =
      new DeformableConvPluginDynamic(name, stride, padding, dilation, deformableGroup, group);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
  auto plugin = new DeformableConvPluginDynamic(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

REGISTER_TENSORRT_PLUGIN(DeformableConvPluginDynamicCreator);
}  // namespace mmdeploy
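serialize and the deserializing constructor above must agree on field order: mStride, mPadding, mDilation, mDeformableGroup and mGroup are written and read back in exactly the same sequence. A host-only sketch of that order-sensitive pairing; write_value and read_value are illustrative stand-ins for serialize_value and deserialize_value.

// Host-only sketch of the order-sensitive serialize/deserialize pairing.
#include <cassert>
#include <cstring>

template <typename T>
void write_value(char **buf, const T &v) {
  std::memcpy(*buf, &v, sizeof(T));
  *buf += sizeof(T);
}

template <typename T>
void read_value(const char **buf, T *v) {
  std::memcpy(v, *buf, sizeof(T));
  *buf += sizeof(T);
}

int main() {
  char storage[sizeof(int) * 2];
  char *w = storage;
  write_value(&w, 7);  // e.g. mDeformableGroup
  write_value(&w, 2);  // e.g. mGroup
  const char *r = storage;
  int dg = 0, g = 0;
  read_value(&r, &dg);  // reads must mirror the write order exactly
  read_value(&r, &g);
  assert(dg == 7 && g == 2);
  return 0;
}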
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp
0 → 100644
View file @ 546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_DEFORM_CONV_HPP
#define TRT_DEFORM_CONV_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class DeformableConvPluginDynamic : public TRTPluginBase {
 public:
  DeformableConvPluginDynamic(const std::string &name, const nvinfer1::Dims stride,
                              const nvinfer1::Dims padding, const nvinfer1::Dims dilation,
                              const int deformableGroup, const int group);

  DeformableConvPluginDynamic(const std::string name, const void *data, size_t length);

  DeformableConvPluginDynamic() = delete;

  ~DeformableConvPluginDynamic() TRT_NOEXCEPT override;

  // IPluginV2DynamicExt Methods
  nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
                                          int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
      TRT_NOEXCEPT override;
  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
                                 int nbOutputs) TRT_NOEXCEPT override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc *out,
                       int nbOutputs) TRT_NOEXCEPT override;
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                          const nvinfer1::PluginTensorDesc *outputs,
                          int nbOutputs) const TRT_NOEXCEPT override;
  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
              const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
              void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
  void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext,
                       nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override;
  void detachFromContext() TRT_NOEXCEPT override;

  // IPluginV2Ext Methods
  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
                                       int nbInputs) const TRT_NOEXCEPT override;

  // IPluginV2 Methods
  const char *getPluginType() const TRT_NOEXCEPT override;
  const char *getPluginVersion() const TRT_NOEXCEPT override;
  int getNbOutputs() const TRT_NOEXCEPT override;
  size_t getSerializationSize() const TRT_NOEXCEPT override;
  void serialize(void *buffer) const TRT_NOEXCEPT override;

 private:
  nvinfer1::Dims mStride;
  nvinfer1::Dims mPadding;
  nvinfer1::Dims mDilation;
  int mDeformableGroup;
  int mGroup;

  cublasHandle_t m_cublas_handle;
};

class DeformableConvPluginDynamicCreator : public TRTPluginCreatorBase {
 public:
  DeformableConvPluginDynamicCreator();

  const char *getPluginName() const TRT_NOEXCEPT override;

  const char *getPluginVersion() const TRT_NOEXCEPT override;

  nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
      TRT_NOEXCEPT override;

  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
                                         size_t serialLength) TRT_NOEXCEPT override;
};
}  // namespace mmdeploy
#endif  // TRT_DEFORM_CONV_HPP
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu
0 → 100644
View file @ 546b4279
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer
*****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
*FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer
*********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
#include "common_cuda_helper.hpp"
#include "trt_deform_conv_kernel.cuh"
#include "trt_deform_conv_kernel.hpp"
#include "trt_plugin_helper.hpp"
template <typename scalar_t>
void deform_conv_im2col(const scalar_t *input, const scalar_t *offset, scalar_t *column,
                        const int channels, const int height, const int width, const int ksize_h,
                        const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
                        const int stride_w, const int dilation_h, const int dilation_w,
                        const int parallel_imgs, const int deformable_group, cudaStream_t stream) {
  int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
  int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
  int num_kernels = channels * height_col * width_col * parallel_imgs;
  int channel_per_deformable_group = channels / deformable_group;

  deformable_im2col_gpu_kernel<scalar_t>
      <<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
          num_kernels, input, offset, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
          stride_w, dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, channels,
          deformable_group, height_col, width_col, column);

  cudaCheckError();
}

template <typename scalar_t>
void deform_conv(const scalar_t *input, const scalar_t *weight, const scalar_t *offset,
                 scalar_t *output, void *workspace, int batchSize, int nInputPlane,
                 int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH,
                 int padW, int padH, int dilationW, int dilationH, int group,
                 int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
                 cudaStream_t stream) {
  size_t word_size = sizeof(scalar_t);
  im2col_step = std::min(int(batchSize), im2col_step);
  long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  long outputHW = outputHeight * outputWidth;
  long kHW = kH * kW;
  long columns_size =
      mmdeploy::getAlignedSize(nInputPlane * kHW * im2col_step * outputHW * word_size);

  // column buffer for img2col
  char *workspace_ptr = reinterpret_cast<char *>(workspace);
  scalar_t *columns = reinterpret_cast<scalar_t *>(workspace_ptr);
  workspace_ptr = workspace_ptr + columns_size;

  scalar_t *output_buffer;
  if (im2col_step == 1) {
    output_buffer = output;
  } else {
    // output need permute when im2col_step != 1
    output_buffer = reinterpret_cast<scalar_t *>(workspace_ptr);
  }

  long input_elt_step = im2col_step * nInputPlane * inputHeight * inputWidth;
  long offset_elt_step = im2col_step * deformable_group * 2 * kHW * outputHW;
  long out_buffer_step = nOutputPlane * im2col_step * outputHW;
  long col_g_step = nInputPlane * kHW * im2col_step * outputHW / group;
  long weight_g_step = nOutputPlane * nInputPlane * kHW / (group * group);
  long out_buffer_g_step = out_buffer_step / group;
  int m = nOutputPlane / group;
  int n = im2col_step * outputHW;
  int k = nInputPlane * kHW / group;
  scalar_t alpha = 1.f;
  scalar_t beta = 0.f;

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    const scalar_t *input_start = input + elt * input_elt_step;
    const scalar_t *offset_start = offset + elt * offset_elt_step;

    deform_conv_im2col<scalar_t>(input_start, offset_start, columns, nInputPlane, inputHeight,
                                 inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW,
                                 im2col_step, deformable_group, stream);

    for (int g = 0; g < group; ++g) {
      const scalar_t *weight_start = weight + g * weight_g_step;
      scalar_t *col_start = columns + g * col_g_step;
      scalar_t *out_buffer_start = output_buffer + elt * out_buffer_step + g * out_buffer_g_step;

      cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start,
                               n, weight_start, k, &beta, out_buffer_start, n);
      cudaCheckError();
    }
  }

  if (im2col_step != 1) {
    int output_buffer_shape[5] = {batchSize / im2col_step, nOutputPlane, im2col_step,
                                  static_cast<int>(outputHeight), static_cast<int>(outputWidth)};
    int output_buffer_permute[5] = {0, 2, 1, 3, 4};
    memcpyPermute<scalar_t>(output, output_buffer, &output_buffer_shape[0],
                            &output_buffer_permute[0], 5, stream);
  }
}

template void deform_conv<float>(const float *input, const float *weight, const float *offset,
                                 float *output, void *workspace, int batchSize, int nInputPlane,
                                 int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH,
                                 int dW, int dH, int padW, int padH, int dilationW, int dilationH,
                                 int group, int deformable_group, int im2col_step,
                                 cublasHandle_t cublas_handle, cudaStream_t stream);

template void deform_conv<__half>(const __half *input, const __half *weight, const __half *offset,
                                  __half *output, void *workspace, int batchSize, int nInputPlane,
                                  int inputHeight, int inputWidth, int nOutputPlane, int kW,
                                  int kH, int dW, int dH, int padW, int padH, int dilationW,
                                  int dilationH, int group, int deformable_group, int im2col_step,
                                  cublasHandle_t cublas_handle, cudaStream_t stream);
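deform_conv derives the output spatial extent with the standard convolution formula out = (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1, and the plugin's workspace sizing depends on the same numbers. A small host-only check of the formula, illustrative only.

// Host-only check of the output-size arithmetic used in deform_conv.
#include <cassert>

int conv_out(int in, int pad, int dilation, int k, int stride) {
  return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
}

int main() {
  assert(conv_out(56, 1, 1, 3, 1) == 56);  // 3x3, stride 1, pad 1 preserves the size
  assert(conv_out(56, 1, 1, 3, 2) == 28);  // stride 2 halves it (integer division)
  return 0;
}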
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh
0 → 100644
View file @ 546b4279
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer
*****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
*FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer
*********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
#include <cuda_fp16.h>
#include "common_cuda_helper.hpp"
template <typename scalar_t>
__device__ __forceinline__ scalar_t deformable_im2col_bilinear(const scalar_t *__restrict__ input,
                                                               const int height, const int width,
                                                               float h, float w) {
  if (h <= -1 || height <= h || w <= -1 || width <= w) {
    return 0;
  }

  const int h_low = floorf(h);
  const int w_low = floorf(w);

  input += h_low * width;
  const scalar_t v1 = (h_low >= 0 && w_low >= 0) ? input[w_low] : static_cast<scalar_t>(0.0f);
  const int w_high = w_low + 1;
  const scalar_t v2 =
      (h_low >= 0 && w_high <= width - 1) ? input[w_high] : static_cast<scalar_t>(0.0f);
  const scalar_t lw = w - w_low;
  const scalar_t v_low = fmaf(v2 - v1, lw, v1);

  input += width;
  const scalar_t v3 =
      (h_low <= height - 2 && w_low >= 0) ? input[w_low] : static_cast<scalar_t>(0.0f);
  const scalar_t v4 =
      (h_low <= height - 2 && w_high <= width - 1) ? input[w_high] : static_cast<scalar_t>(0.0f);
  const scalar_t v_high = fmaf(v4 - v3, lw, v3);
  const scalar_t lh = h - h_low;
  const scalar_t val = fmaf(v_high - v_low, lh, v_low);
  return val;
}

template <>
__device__ __forceinline__ __half deformable_im2col_bilinear(const __half *__restrict__ input,
                                                             const int height, const int width,
                                                             float h, float w) {
  if (h <= -1 || height <= h || w <= -1 || width <= w) {
    return 0;
  }

  const int h_low = floorf(h);
  const int w_low = floorf(w);

  input += h_low * width;
  const float v1 = (h_low >= 0 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f;
  const int w_high = w_low + 1;
  const float v2 = (h_low >= 0 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f;
  const float lw = w - w_low;
  const float v_low = fmaf(v2 - v1, lw, v1);

  input += width;
  const float v3 = (h_low <= height - 2 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f;
  const float v4 =
      (h_low <= height - 2 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f;
  const float v_high = fmaf(v4 - v3, lw, v3);
  const float lh = h - h_low;
  const float val = fmaf(v_high - v_low, lh, v_low);
  return __float2half(val);
}

template <typename scalar_t>
__global__ void deformable_im2col_gpu_kernel(
    const int n, const scalar_t *__restrict__ data_im, const scalar_t *__restrict__ data_offset,
    const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col, const int width_col,
    scalar_t *__restrict__ data_col) {
  const int hw_col = height_col * width_col;
  const int data_col_step = batch_size * hw_col;

  CUDA_1D_KERNEL_LOOP(index, n) {
    // index index of output matrix
    int tmp_index = index;
    const int w_col = tmp_index % width_col;
    tmp_index /= width_col;
    const int h_col = tmp_index % height_col;
    tmp_index /= height_col;
    const int b_col = tmp_index % batch_size;
    const int c_im = tmp_index / batch_size;
    const int c_col = c_im * kernel_h * kernel_w;

    // compute deformable group index
    const int deformable_group_index = c_im / channel_per_deformable_group;

    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    scalar_t *__restrict__ data_col_ptr = data_col + c_col * data_col_step + index % data_col_step;
    const scalar_t *__restrict__ data_im_ptr =
        data_im + (b_col * num_channels + c_im) * height * width;
    const scalar_t *__restrict__ data_offset_ptr =
        data_offset +
        ((b_col * deformable_group + deformable_group_index) << 1) * kernel_h * kernel_w * hw_col +
        h_col * width_col + w_col;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        const int data_offset_h = (i * kernel_w + j) * hw_col << 1;
        const scalar_t offset_h = data_offset_ptr[data_offset_h];
        const int data_offset_w = data_offset_h + hw_col;
        const scalar_t offset_w = data_offset_ptr[data_offset_w];
        const scalar_t h_im = h_in + i * dilation_h + (float)offset_h;
        const scalar_t w_im = w_in + j * dilation_w + (float)offset_w;
        const scalar_t val = deformable_im2col_bilinear(data_im_ptr, height, width, h_im, w_im);
        *data_col_ptr = val;
        data_col_ptr += data_col_step;
      }
    }
  }
}
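deformable_im2col_bilinear samples the input at a fractional (h, w) with bilinear interpolation, treating any corner that falls outside the image as zero. A host-only sketch of the same sampling rule (the per-corner guards are folded into one bounds check here and the fmaf calls are expanded); illustrative only.

// Host-only sketch of zero-padded bilinear sampling, as in deformable_im2col_bilinear.
#include <cassert>
#include <cmath>

float bilinear(const float *img, int height, int width, float h, float w) {
  if (h <= -1 || height <= h || w <= -1 || width <= w) return 0.0f;
  const int h_low = (int)std::floor(h), w_low = (int)std::floor(w);
  const float lh = h - h_low, lw = w - w_low;
  auto at = [&](int y, int x) -> float {  // out-of-range corners contribute zero
    return (y >= 0 && y < height && x >= 0 && x < width) ? img[y * width + x] : 0.0f;
  };
  const float v_low = at(h_low, w_low) + lw * (at(h_low, w_low + 1) - at(h_low, w_low));
  const float v_high =
      at(h_low + 1, w_low) + lw * (at(h_low + 1, w_low + 1) - at(h_low + 1, w_low));
  return v_low + lh * (v_high - v_low);
}

int main() {
  const float img[4] = {0.f, 1.f, 2.f, 3.f};  // 2x2 image
  assert(std::fabs(bilinear(img, 2, 2, 0.5f, 0.5f) - 1.5f) < 1e-6f);  // center = average
  return 0;
}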
csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp
0 → 100644
View file @ 546b4279
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TRT_DEFORM_CONV_KERNEL_HPP
#define TRT_DEFORM_CONV_KERNEL_HPP
#include <cublas_v2.h>
#include <cuda_runtime.h>
template <typename scalar_t>
void deform_conv_im2col(const scalar_t *input, const scalar_t *offset, scalar_t *column,
                        const int channels, const int height, const int width, const int ksize_h,
                        const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
                        const int stride_w, const int dilation_h, const int dilation_w,
                        const int parallel_imgs, const int deformable_group, cudaStream_t stream);

template <typename scalar_t>
void deform_conv(const scalar_t *input, const scalar_t *weight, const scalar_t *offset,
                 scalar_t *output, void *workspace, int batchSize, int nInputPlane,
                 int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH,
                 int padW, int padH, int dilationW, int dilationH, int group,
                 int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
                 cudaStream_t stream);

#endif  // TRT_DEFORM_CONV_KERNEL_HPP
csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp
0 → 100644
View file @ 546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#include "gather_topk.hpp"
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include "NvInferVersion.h"
#include "gather_topk_kernel.hpp"
#include "trt_serialize.hpp"
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"GatherTopk"};
}  // namespace

GatherTopk::GatherTopk(const std::string &name) : TRTPluginBase(name) {}

GatherTopk::GatherTopk(const std::string name, const void *data, size_t length)
    : TRTPluginBase(name) {}

nvinfer1::IPluginV2DynamicExt *GatherTopk::clone() const TRT_NOEXCEPT {
  GatherTopk *plugin = new GatherTopk(mLayerName);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::DimsExprs GatherTopk::getOutputDimensions(int outputIndex,
                                                    const nvinfer1::DimsExprs *inputs,
                                                    int nbInputs,
                                                    nvinfer1::IExprBuilder &exprBuilder)
    TRT_NOEXCEPT {
  assert(inputs[0].nbDims >= inputs[1].nbDims);
  nvinfer1::DimsExprs ret;
  ret.nbDims = inputs[0].nbDims;
  for (int i = 0; i < inputs[1].nbDims; ++i) {
    ret.d[i] = inputs[1].d[i];
  }
  for (int i = inputs[1].nbDims; i < inputs[0].nbDims; ++i) {
    ret.d[i] = inputs[0].d[i];
  }
  return ret;
}

bool GatherTopk::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
                                           int nbInputs, int nbOutputs) TRT_NOEXCEPT {
  switch (pos) {
    case 0:
      // data
      return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
              ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) ||
             (ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
              ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
    case 1:
      // indices
      return ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
             ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
    case 2:
      // output
      return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
    default:
      return true;
  }
  return true;
}

void GatherTopk::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
                                 const nvinfer1::DynamicPluginTensorDesc *outputs,
                                 int nbOutputs) TRT_NOEXCEPT {}

size_t GatherTopk::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                                    const nvinfer1::PluginTensorDesc *outputs,
                                    int nbOutputs) const TRT_NOEXCEPT {
  return 0;
}

int GatherTopk::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
                        const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
                        void *const *outputs, void *workSpace, cudaStream_t stream) TRT_NOEXCEPT {
  const int *dims = &(inputDesc[0].dims.d[0]);
  const int *indices_dims = &(inputDesc[1].dims.d[0]);
  int nbDims = inputDesc[0].dims.nbDims;
  int indice_nbDims = inputDesc[1].dims.nbDims;

  const void *data = inputs[0];
  const void *indices = inputs[1];
  void *output = outputs[0];

  auto data_type = inputDesc[0].type;

  switch (data_type) {
    case nvinfer1::DataType::kFLOAT:
      gather_topk_impl<float>((float *)data, (int *)indices, dims, nbDims, indices_dims,
                              indice_nbDims, (float *)output, stream);
      break;
    case nvinfer1::DataType::kINT32:
      gather_topk_impl<int>((int *)data, (int *)indices, dims, nbDims, indices_dims,
                            indice_nbDims, (int *)output, stream);
      break;
    default:
      break;
  }

  return 0;
}

nvinfer1::DataType GatherTopk::getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
                                                 int nbInputs) const TRT_NOEXCEPT {
  return inputTypes[0];
}

// IPluginV2 Methods
const char *GatherTopk::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }

const char *GatherTopk::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }

int GatherTopk::getNbOutputs() const TRT_NOEXCEPT { return 1; }

size_t GatherTopk::getSerializationSize() const TRT_NOEXCEPT { return 0; }

void GatherTopk::serialize(void *buffer) const TRT_NOEXCEPT {}

GatherTopkCreator::GatherTopkCreator() {
  mPluginAttributes.clear();
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *GatherTopkCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }

const char *GatherTopkCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }

nvinfer1::IPluginV2 *GatherTopkCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
  auto *plugin = new GatherTopk(name);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::IPluginV2 *GatherTopkCreator::deserializePlugin(const char *name, const void *serialData,
                                                          size_t serialLength) TRT_NOEXCEPT {
  auto plugin = new GatherTopk(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

REGISTER_TENSORRT_PLUGIN(GatherTopkCreator);
}  // namespace mmdeploy
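GatherTopk::getOutputDimensions builds the output shape from the indices' leading dimensions followed by the data's remaining trailing dimensions, so gathering indices of shape [batch, topk] from data of shape [batch, num_boxes, 4] yields [batch, topk, 4]. A host-only sketch of that shape rule, illustrative only.

// Host-only sketch of the GatherTopk output-shape rule.
#include <cassert>
#include <vector>

std::vector<int> gather_topk_out_dims(const std::vector<int> &data,
                                      const std::vector<int> &indices) {
  assert(data.size() >= indices.size());  // mirrors the assert in getOutputDimensions
  std::vector<int> out(data.size());
  for (size_t i = 0; i < indices.size(); ++i) out[i] = indices[i];
  for (size_t i = indices.size(); i < data.size(); ++i) out[i] = data[i];
  return out;
}

int main() {
  auto out = gather_topk_out_dims({2, 1000, 4}, {2, 100});
  assert((out == std::vector<int>{2, 100, 4}));
  return 0;
}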