Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
mmdeploy
Commits
546b4279
Commit
546b4279
authored
Jun 25, 2025
by
limm
Browse files
add csrc and mmdeploy module
parent
502f4fb9
Pipeline
#2810
canceled with stages
Changes
447
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2417 additions
and
0 deletions
+2417
-0
csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.hpp
...mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.hpp
+64
-0
csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.cu
...oy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.cu
+46
-0
csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.hpp
...y/backend_ops/tensorrt/gather_topk/gather_topk_kernel.hpp
+10
-0
csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.cpp
...ploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.cpp
+154
-0
csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.hpp
...ploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.hpp
+66
-0
csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.cu
...ackend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.cu
+43
-0
csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.hpp
...ckend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.hpp
+10
-0
csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.cpp
...oy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.cpp
+202
-0
csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.hpp
...oy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.hpp
+84
-0
csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.cu
...kend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.cu
+396
-0
csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.hpp
...end_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.hpp
+13
-0
csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.cpp
.../backend_ops/tensorrt/instance_norm/trt_instance_norm.cpp
+214
-0
csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.hpp
.../backend_ops/tensorrt/instance_norm/trt_instance_norm.hpp
+93
-0
csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp
...sorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp
+306
-0
csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.hpp
...sorrt/modulated_deform_conv/trt_modulated_deform_conv.hpp
+82
-0
csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.cu
...modulated_deform_conv/trt_modulated_deform_conv_kernel.cu
+138
-0
csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.hpp
...odulated_deform_conv/trt_modulated_deform_conv_kernel.hpp
+15
-0
csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp
...sorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp
+227
-0
csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp
...sorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp
+78
-0
csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu
...multi_level_roi_align/trt_multi_level_roi_align_kernel.cu
+176
-0
No files found.
Too many changes to show.
To preserve performance only
447 of 447+
files are displayed.
Plain diff
Email patch
csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.hpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
// NOTE(review): the include guard was TRT_SCATTERND_HPP, copy-pasted from the
// scatternd plugin; renamed to match this header so it cannot collide with the
// real scatternd header in the same plugin library.
#ifndef TRT_GATHER_TOPK_HPP
#define TRT_GATHER_TOPK_HPP
#include <cublas_v2.h>

#include <memory>
#include <string>
#include <vector>

#include "trt_plugin_base.hpp"

namespace mmdeploy {

// TensorRT dynamic-shape plugin that gathers entries of input[0] selected by
// the integer indices in input[1] (e.g. the indices produced by a TopK layer).
// Device-side work is delegated to gather_topk_impl (gather_topk_kernel.hpp).
class GatherTopk : public TRTPluginBase {
 public:
  // Build-time constructor; `name` is the layer name used by TensorRT.
  GatherTopk(const std::string &name);

  // Deserialization constructor: rebuilds the plugin from `length` bytes at
  // `data`, produced by serialize().
  GatherTopk(const std::string name, const void *data, size_t length);

  // A plugin must always be created with a layer name.
  GatherTopk() = delete;

  // IPluginV2DynamicExt Methods
  nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
                                          int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
      TRT_NOEXCEPT override;
  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
                                 int nbOutputs) TRT_NOEXCEPT override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc *out,
                       int nbOutputs) TRT_NOEXCEPT override;
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                          const nvinfer1::PluginTensorDesc *outputs,
                          int nbOutputs) const TRT_NOEXCEPT override;
  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
              const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
              void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;

  // IPluginV2Ext Methods
  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
                                       int nbInputs) const TRT_NOEXCEPT override;

  // IPluginV2 Methods
  const char *getPluginType() const TRT_NOEXCEPT override;
  const char *getPluginVersion() const TRT_NOEXCEPT override;
  int getNbOutputs() const TRT_NOEXCEPT override;
  size_t getSerializationSize() const TRT_NOEXCEPT override;
  void serialize(void *buffer) const TRT_NOEXCEPT override;
};

// Factory registered with TensorRT; creates and deserializes GatherTopk.
class GatherTopkCreator : public TRTPluginCreatorBase {
 public:
  GatherTopkCreator();

  const char *getPluginName() const TRT_NOEXCEPT override;
  const char *getPluginVersion() const TRT_NOEXCEPT override;
  nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
      TRT_NOEXCEPT override;
  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
                                         size_t serialLength) TRT_NOEXCEPT override;
};

}  // namespace mmdeploy
#endif  // TRT_GATHER_TOPK_HPP
csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.cu
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#include <functional>
#include <numeric>
#include <vector>

#include "common_cuda_helper.hpp"
#include "gather_topk_kernel.hpp"
#include "trt_plugin_helper.hpp"

// Copies output[b, n, c] = input[b, indices[b, n], c].
// The tensors are treated as flattened 3-D views:
//   input:   (batch, num_input,   channel)
//   indices: (batch, num_indices)
//   output:  (batch, num_indices, channel)
// CUDA_1D_KERNEL_LOOP (from common_cuda_helper.hpp) iterates `index` over
// batch * num_indices * channel elements.
// NOTE(review): indices are used unchecked — an index outside
// [0, num_input) reads out of bounds; callers must guarantee validity.
template <typename scalar_t>
__global__ void gather_topk_kernel(const scalar_t* input, const int* indices, scalar_t* output,
                                   int batch, int num_input, int num_indices, int channel) {
  CUDA_1D_KERNEL_LOOP(index, batch * num_indices * channel) {
    // Decompose the flat output index into (batch, index, channel) coords.
    const int b_id = index / (num_indices * channel);
    const int n_id = (index / channel) % num_indices;
    const int c_id = index % channel;

    // Row of `input` selected for this output row.
    const int input_n_id = indices[b_id * num_indices + n_id];
    const scalar_t value = input[b_id * num_input * channel + input_n_id * channel + c_id];
    output[b_id * num_indices * channel + n_id * channel + c_id] = value;
  }
}

// Host-side launcher.
//   dims / nbDims:                 shape of `input`
//   indices_dims / indice_nbDims:  shape of `indices`
// All indices dims except the last must match the leading dims of `input`
// (they are collapsed into `batch`); dims beyond indice_nbDims are collapsed
// into `channel`.
template <typename scalar_t>
void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims,
                      const int* indices_dims, int indice_nbDims, scalar_t* output,
                      cudaStream_t stream) {
  // Leading dims shared by input and indices form the batch.
  int batch = 1;
  for (int i = 0; i < indice_nbDims - 1; ++i) batch *= dims[i];
  // Size of the gathered axis in input vs. indices.
  int num_input = dims[indice_nbDims - 1];
  int num_indices = indices_dims[indice_nbDims - 1];
  // Trailing input dims are gathered wholesale per index.
  int channel = 1;
  for (int i = indice_nbDims; i < nbDims; ++i) channel *= dims[i];
  const int col_block = DIVUP(batch * num_indices * channel, THREADS_PER_BLOCK);
  gather_topk_kernel<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(input, indices, output, batch,
                                                                  num_input, num_indices, channel);
}

// Explicit instantiations for the element types the plugin supports.
template void gather_topk_impl<float>(const float* input, const int* indices, const int* dims,
                                      int nbDims, const int* indices_dims, int indice_nbDims,
                                      float* output, cudaStream_t stream);

template void gather_topk_impl<int32_t>(const int32_t* input, const int* indices, const int* dims,
                                        int nbDims, const int* indices_dims, int indice_nbDims,
                                        int32_t* output, cudaStream_t stream);
csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.hpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
// NOTE(review): the include guard was TRT_GRID_SAMPLER_KERNEL_HPP — identical
// to the guard of trt_grid_sampler_kernel.hpp in this same library, so a TU
// including both headers would silently drop one of them. Renamed to a guard
// that matches this file.
#ifndef TRT_GATHER_TOPK_KERNEL_HPP
#define TRT_GATHER_TOPK_KERNEL_HPP
#include <cuda_runtime.h>

// Launches the gather-topk CUDA kernel on `stream`:
//   output[b, n, c] = input[b, indices[b, n], c]
// with input shape `dims` (nbDims entries) and indices shape `indices_dims`
// (indice_nbDims entries). See gather_topk_kernel.cu for the layout rules.
template <typename scalar_t>
void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims,
                      const int* indices_dims, int indice_nbDims, scalar_t* output,
                      cudaStream_t stream);
#endif  // TRT_GATHER_TOPK_KERNEL_HPP
csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.cpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_grid_priors.hpp"

#include <assert.h>

#include <chrono>

#include "trt_grid_priors_kernel.hpp"
#include "trt_serialize.hpp"

using namespace nvinfer1;

namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"GridPriorsTRT"};
}  // namespace

// Build-time constructor. `stride` holds (stride_w, stride_h) in d[0], d[1].
GridPriorsTRT::GridPriorsTRT(const std::string &name, const nvinfer1::Dims stride)
    : TRTPluginBase(name), mStride(stride) {}

// Deserialization constructor: restores mStride from the serialized buffer.
GridPriorsTRT::GridPriorsTRT(const std::string name, const void *data, size_t length)
    : TRTPluginBase(name) {
  deserialize_value(&data, &length, &mStride);
}

GridPriorsTRT::~GridPriorsTRT() {}

nvinfer1::IPluginV2DynamicExt *GridPriorsTRT::clone() const TRT_NOEXCEPT {
  GridPriorsTRT *plugin = new GridPriorsTRT(mLayerName, mStride);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

// Output is (num_base_anchors * feat_h * feat_w, 4).
nvinfer1::DimsExprs GridPriorsTRT::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
  // input[0] == base_anchor
  // input[1] == empty_h
  // input[2] == empty_w
  nvinfer1::DimsExprs ret;
  ret.nbDims = 2;
  auto area =
      exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs[2].d[0], *inputs[1].d[0]);
  ret.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *area, *(inputs[0].d[0]));
  ret.d[1] = exprBuilder.constant(4);
  return ret;
}

// Input 0 must be linear fp32; the output must match it. The two "empty"
// shape-carrying inputs (pos 1 and 2) accept anything.
bool GridPriorsTRT::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
                                              int nbInputs, int nbOutputs) TRT_NOEXCEPT {
  if (pos == 0) {
    return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
            ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
  } else if (pos - nbInputs == 0) {
    return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
  } else {
    return true;
  }
}

// Shifts each base anchor across the (feat_h, feat_w) grid with the configured
// strides. Only kFLOAT is implemented; any other dtype returns 1 (failure).
int GridPriorsTRT::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
                           const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
                           void *const *outputs, void *workSpace,
                           cudaStream_t stream) TRT_NOEXCEPT {
  int num_base_anchors = inputDesc[0].dims.d[0];
  int feat_h = inputDesc[1].dims.d[0];
  int feat_w = inputDesc[2].dims.d[0];

  const void *base_anchor = inputs[0];
  void *output = outputs[0];

  auto data_type = inputDesc[0].type;
  switch (data_type) {
    case nvinfer1::DataType::kFLOAT:
      // mStride.d[0] is stride_w, mStride.d[1] is stride_h (see createPlugin).
      trt_grid_priors_impl<float>((float *)base_anchor, (float *)output, num_base_anchors, feat_w,
                                  feat_h, mStride.d[0], mStride.d[1], stream);
      break;
    default:
      return 1;
  }

  return 0;
}

nvinfer1::DataType GridPriorsTRT::getOutputDataType(int index,
                                                    const nvinfer1::DataType *inputTypes,
                                                    int nbInputs) const TRT_NOEXCEPT {
  return inputTypes[0];
}

// IPluginV2 Methods
const char *GridPriorsTRT::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }

const char *GridPriorsTRT::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }

int GridPriorsTRT::getNbOutputs() const TRT_NOEXCEPT { return 1; }

size_t GridPriorsTRT::getSerializationSize() const TRT_NOEXCEPT {
  return serialized_size(mStride);
}

void GridPriorsTRT::serialize(void *buffer) const TRT_NOEXCEPT {
  // Fixed: removed a stray empty statement (`;`) after this call.
  serialize_value(&buffer, mStride);
}

////////////////////// creator /////////////////////////////

GridPriorsTRTCreator::GridPriorsTRTCreator() {
  mPluginAttributes.clear();
  mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_h"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_w"));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *GridPriorsTRTCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }

const char *GridPriorsTRTCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }

// Reads the optional int fields "stride_w"/"stride_h" (default 1) and builds
// the plugin; fields with null data are skipped.
nvinfer1::IPluginV2 *GridPriorsTRTCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
  int stride_w = 1;
  int stride_h = 1;
  for (int i = 0; i < fc->nbFields; i++) {
    if (fc->fields[i].data == nullptr) {
      continue;
    }
    std::string field_name(fc->fields[i].name);

    if (field_name.compare("stride_w") == 0) {
      stride_w = static_cast<const int *>(fc->fields[i].data)[0];
    }
    if (field_name.compare("stride_h") == 0) {
      stride_h = static_cast<const int *>(fc->fields[i].data)[0];
    }
  }
  // Packed as (stride_w, stride_h); enqueue() unpacks in the same order.
  nvinfer1::Dims stride{2, {stride_w, stride_h}};

  GridPriorsTRT *plugin = new GridPriorsTRT(name, stride);
  plugin->setPluginNamespace(getPluginNamespace());

  return plugin;
}

nvinfer1::IPluginV2 *GridPriorsTRTCreator::deserializePlugin(const char *name,
                                                             const void *serialData,
                                                             size_t serialLength) TRT_NOEXCEPT {
  auto plugin = new GridPriorsTRT(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

REGISTER_TENSORRT_PLUGIN(GridPriorsTRTCreator);
}  // namespace mmdeploy
csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.hpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GRID_PRIORS_HPP
#define TRT_GRID_PRIORS_HPP
#include <cublas_v2.h>

#include <memory>
#include <string>
#include <vector>

#include "trt_plugin_base.hpp"

namespace mmdeploy {

// TensorRT dynamic-shape plugin that expands a set of base anchors over a
// (feat_h, feat_w) feature-map grid with fixed strides, producing an
// (num_base_anchors * feat_h * feat_w, 4) tensor of anchor boxes.
class GridPriorsTRT : public TRTPluginBase {
 public:
  // Build-time constructor; `stride` carries (stride_w, stride_h).
  GridPriorsTRT(const std::string &name, const nvinfer1::Dims stride);

  // Deserialization constructor: restores state from serialize() output.
  GridPriorsTRT(const std::string name, const void *data, size_t length);

  // A plugin must always be created with a layer name.
  GridPriorsTRT() = delete;

  ~GridPriorsTRT() TRT_NOEXCEPT override;

  // IPluginV2DynamicExt Methods
  nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
                                          int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
      TRT_NOEXCEPT override;
  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
                                 int nbOutputs) TRT_NOEXCEPT override;
  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
              const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
              void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;

  // IPluginV2Ext Methods
  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
                                       int nbInputs) const TRT_NOEXCEPT override;

  // IPluginV2 Methods
  const char *getPluginType() const TRT_NOEXCEPT override;
  const char *getPluginVersion() const TRT_NOEXCEPT override;
  int getNbOutputs() const TRT_NOEXCEPT override;
  size_t getSerializationSize() const TRT_NOEXCEPT override;
  void serialize(void *buffer) const TRT_NOEXCEPT override;

 private:
  // (stride_w, stride_h) in d[0], d[1]; the only serialized state.
  nvinfer1::Dims mStride;

  // NOTE(review): this handle appears unused by the visible implementation
  // (never initialized or referenced in trt_grid_priors.cpp) — candidate for
  // removal together with the <cublas_v2.h> include; confirm before deleting.
  cublasHandle_t m_cublas_handle;
};

// Factory registered with TensorRT; creates and deserializes GridPriorsTRT.
class GridPriorsTRTCreator : public TRTPluginCreatorBase {
 public:
  GridPriorsTRTCreator();

  const char *getPluginName() const TRT_NOEXCEPT override;
  const char *getPluginVersion() const TRT_NOEXCEPT override;
  nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
      TRT_NOEXCEPT override;
  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
                                         size_t serialLength) TRT_NOEXCEPT override;
};

}  // namespace mmdeploy
#endif  // TRT_GRID_PRIORS_HPP
csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.cu
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#include <cuda_fp16.h>

#include "common_cuda_helper.hpp"
#include "trt_grid_priors_kernel.hpp"
#include "trt_plugin_helper.hpp"

// Writes one anchor box per (base_anchor, grid_cell) pair:
//   output[index] = base_anchor[index % num_base_anchors] shifted by
//                   (x * stride_w, y * stride_h) for the cell (x, y).
// Output layout is row-major (feat_h, feat_w, num_base_anchors, 4).
// Each block first stages the (num_base_anchors * 4) base-anchor values in
// dynamically-sized shared memory (size set at launch in the impl below).
template <typename scalar_t>
__global__ void trt_grid_priors_kernel(const scalar_t* base_anchor, scalar_t* output,
                                       int num_base_anchors, int feat_w, int feat_h, int stride_w,
                                       int stride_h) {
  // load base anchor into shared memory.
  extern __shared__ scalar_t shared_base_anchor[];
  for (int i = threadIdx.x; i < num_base_anchors * 4; i += blockDim.x) {
    shared_base_anchor[i] = base_anchor[i];
  }
  __syncthreads();

  CUDA_1D_KERNEL_LOOP(index, num_base_anchors * feat_w * feat_h) {
    // `<< 2` == `* 4`: offset of this anchor's (x1, y1, x2, y2) quadruple.
    const int a_offset = (index % num_base_anchors) << 2;
    // Pixel shift of the grid cell this index falls in.
    const scalar_t w = scalar_t(((index / num_base_anchors) % feat_w) * stride_w);
    const scalar_t h = scalar_t((index / (feat_w * num_base_anchors)) * stride_h);

    auto out_start = output + index * 4;
    out_start[0] = shared_base_anchor[a_offset] + w;
    out_start[1] = shared_base_anchor[a_offset + 1] + h;
    out_start[2] = shared_base_anchor[a_offset + 2] + w;
    out_start[3] = shared_base_anchor[a_offset + 3] + h;
  }
}

// Host-side launcher. The dynamic shared-memory size is num_base_anchors * 4
// scalars rounded up to a multiple of 32 elements.
template <typename scalar_t>
void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors,
                          int feat_w, int feat_h, int stride_w, int stride_h,
                          cudaStream_t stream) {
  trt_grid_priors_kernel<<<GET_BLOCKS(num_base_anchors * feat_w * feat_h), THREADS_PER_BLOCK,
                           DIVUP(num_base_anchors * 4, 32) * 32 * sizeof(scalar_t), stream>>>(
      base_anchor, output, (int)num_base_anchors, (int)feat_w, (int)feat_h, (int)stride_w,
      (int)stride_h);
}

// Only float is instantiated; the plugin rejects other dtypes in enqueue().
template void trt_grid_priors_impl<float>(const float* base_anchor, float* output,
                                          int num_base_anchors, int feat_w, int feat_h,
                                          int stride_w, int stride_h, cudaStream_t stream);
csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.hpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TRT_GRID_PRIORS_KERNEL_HPP
#define TRT_GRID_PRIORS_KERNEL_HPP
#include <cuda_runtime.h>

// Launches the grid-priors CUDA kernel on `stream`: expands each of the
// `num_base_anchors` base anchors (4 scalars each) over a (feat_h, feat_w)
// grid with the given strides, writing
// (num_base_anchors * feat_h * feat_w, 4) anchors to `output`.
// See trt_grid_priors_kernel.cu for the implementation.
template <typename scalar_t>
void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors,
                          int feat_w, int feat_h, int stride_w, int stride_h,
                          cudaStream_t stream);

#endif  // TRT_GRID_PRIORS_KERNEL_HPP
csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.cpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#include "trt_grid_sampler.hpp"

#include <assert.h>

#include <chrono>

#include "trt_grid_sampler_kernel.hpp"
#include "trt_plugin_helper.hpp"
#include "trt_serialize.hpp"

namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"grid_sampler"};
}  // namespace

// Build-time constructor.
//   mode:        0 = bilinear, 1 = nearest (see enqueue)
//   paddingMode: 0 = zeros, 1 = border, 2 = reflection (see enqueue)
//   alignCorners: same semantics as torch.nn.functional.grid_sample
TRTGridSampler::TRTGridSampler(const std::string &name, int mode, int paddingMode,
                               bool alignCorners)
    : TRTPluginBase(name), mMode(mode), mPaddingMode(paddingMode), mAlignCorners(alignCorners) {}

// Deserialization constructor: values must be read in the same order they
// were written by serialize().
TRTGridSampler::TRTGridSampler(const std::string name, const void *data, size_t length)
    : TRTPluginBase(name) {
  deserialize_value(&data, &length, &mMode);
  deserialize_value(&data, &length, &mPaddingMode);
  deserialize_value(&data, &length, &mAlignCorners);
}

nvinfer1::IPluginV2DynamicExt *TRTGridSampler::clone() const TRT_NOEXCEPT {
  TRTGridSampler *plugin = new TRTGridSampler(mLayerName, mMode, mPaddingMode, mAlignCorners);
  plugin->setPluginNamespace(getPluginNamespace());

  return plugin;
}

// Output keeps N and C from input[0]; spatial dims come from the grid
// (inputs[1]), whose dims are offset by one (grid is N, H_out, W_out, 2).
nvinfer1::DimsExprs TRTGridSampler::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
  nvinfer1::DimsExprs ret;
  ret.nbDims = inputs[0].nbDims;
  ret.d[0] = inputs[0].d[0];
  ret.d[1] = inputs[0].d[1];
  for (int i = 2; i < ret.nbDims; ++i) {
    ret.d[i] = inputs[1].d[i - 1];
  }
  return ret;
}

// Position 0 must be linear fp32; all other tensors must match position 0.
bool TRTGridSampler::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
                                               int nbInputs, int nbOutputs) TRT_NOEXCEPT {
  if (pos == 0) {
    return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
            ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
  } else {
    return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
  }
}

void TRTGridSampler::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
                                     const nvinfer1::DynamicPluginTensorDesc *outputs,
                                     int nbOutputs) TRT_NOEXCEPT {
  // Validate input arguments
}

// No scratch memory needed: the kernel writes directly to the output.
size_t TRTGridSampler::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                                        const nvinfer1::PluginTensorDesc *outputs,
                                        int nbOutputs) const TRT_NOEXCEPT {
  return 0;
}

// Maps the serialized int codes to kernel enums and dispatches grid_sample.
// Unknown mode/padding codes silently fall back to the defaults set above the
// switches (Bilinear / Zeros); a non-fp32 dtype returns 1 (failure).
int TRTGridSampler::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
                            const nvinfer1::PluginTensorDesc *outputDesc,
                            const void *const *inputs, void *const *outputs, void *workSpace,
                            cudaStream_t stream) TRT_NOEXCEPT {
  nvinfer1::Dims input_dims = inputDesc[0].dims;
  nvinfer1::Dims grid_dims = inputDesc[1].dims;
  nvinfer1::Dims output_dims = outputDesc[0].dims;

  GridSamplerInterpolation interp_mode = GridSamplerInterpolation::Bilinear;
  switch (mMode) {
    case 0:
      interp_mode = GridSamplerInterpolation::Bilinear;
      break;
    case 1:
      interp_mode = GridSamplerInterpolation::Nearest;
      break;
    default:
      break;
  }

  GridSamplerPadding padding_mode = GridSamplerPadding::Zeros;
  switch (mPaddingMode) {
    case 0:
      padding_mode = GridSamplerPadding::Zeros;
      break;

    case 1:
      padding_mode = GridSamplerPadding::Border;
      break;

    case 2:
      padding_mode = GridSamplerPadding::Reflection;
      break;
    default:
      break;
  }

  auto data_type = inputDesc[0].type;

  switch (data_type) {
    case nvinfer1::DataType::kFLOAT:
      grid_sample<float>((float *)outputs[0], (float *)inputs[0], (float *)inputs[1],
                         &(output_dims.d[0]), &(input_dims.d[0]), &(grid_dims.d[0]),
                         input_dims.nbDims, interp_mode, padding_mode, mAlignCorners, stream);
      break;
    default:
      return 1;
      break;
  }

  return 0;
}

nvinfer1::DataType TRTGridSampler::getOutputDataType(int index,
                                                     const nvinfer1::DataType *inputTypes,
                                                     int nbInputs) const TRT_NOEXCEPT {
  return inputTypes[0];
}

// IPluginV2 Methods
const char *TRTGridSampler::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }

const char *TRTGridSampler::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }

int TRTGridSampler::getNbOutputs() const TRT_NOEXCEPT { return 1; }

size_t TRTGridSampler::getSerializationSize() const TRT_NOEXCEPT {
  return serialized_size(mMode) + serialized_size(mPaddingMode) + serialized_size(mAlignCorners);
}

void TRTGridSampler::serialize(void *buffer) const TRT_NOEXCEPT {
  serialize_value(&buffer, mMode);
  serialize_value(&buffer, mPaddingMode);
  serialize_value(&buffer, mAlignCorners);
}

////////////////////// creator /////////////////////////////

TRTGridSamplerCreator::TRTGridSamplerCreator() {
  mPluginAttributes = std::vector<nvinfer1::PluginField>(
      {nvinfer1::PluginField("interpolation_mode"), nvinfer1::PluginField("padding_mode"),
       nvinfer1::PluginField("align_corners")});
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *TRTGridSamplerCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }

const char *TRTGridSamplerCreator::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}

// Reads optional int fields "interpolation_mode", "padding_mode",
// "align_corners" (defaults 0 / 0 / false); null-data fields are skipped.
nvinfer1::IPluginV2 *TRTGridSamplerCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
  int mode = 0;
  int paddingMode = 0;
  bool alignCorners = false;

  for (int i = 0; i < fc->nbFields; i++) {
    if (fc->fields[i].data == nullptr) {
      continue;
    }
    std::string field_name(fc->fields[i].name);

    if (field_name.compare("interpolation_mode") == 0) {
      mode = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("padding_mode") == 0) {
      paddingMode = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("align_corners") == 0) {
      alignCorners = (bool)(static_cast<const int *>(fc->fields[i].data)[0]);
    }
  }

  TRTGridSampler *plugin = new TRTGridSampler(name, mode, paddingMode, alignCorners);
  plugin->setPluginNamespace(getPluginNamespace());

  return plugin;
}

nvinfer1::IPluginV2 *TRTGridSamplerCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
  // This object will be deleted when the network is destroyed, which will
  // call FCPluginDynamic::destroy()
  auto plugin = new TRTGridSampler(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

REGISTER_TENSORRT_PLUGIN(TRTGridSamplerCreator);
}  // namespace mmdeploy
csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.hpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GRID_SAMPLER_HPP
#define TRT_GRID_SAMPLER_HPP
#include <cublas_v2.h>

#include <memory>
#include <string>
#include <vector>

#include "trt_plugin_base.hpp"

namespace mmdeploy {

// TensorRT dynamic-shape plugin implementing grid_sample (the counterpart of
// torch.nn.functional.grid_sample): samples input[0] at the normalized
// coordinates given by the grid tensor input[1].
class TRTGridSampler : public TRTPluginBase {
 public:
  // Build-time constructor.
  //   mode:        interpolation code (0 = bilinear, 1 = nearest)
  //   paddingMode: out-of-range handling (0 = zeros, 1 = border, 2 = reflection)
  //   alignCorners: corner-alignment convention for coordinate unnormalization
  TRTGridSampler(const std::string &name, int mode, int paddingMode, bool alignCorners);

  // Deserialization constructor: restores state from serialize() output.
  TRTGridSampler(const std::string name, const void *data, size_t length);

  // A plugin must always be created with a layer name.
  TRTGridSampler() = delete;

  ~TRTGridSampler() TRT_NOEXCEPT override = default;

  // IPluginV2DynamicExt Methods
  nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
                                          int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
      TRT_NOEXCEPT override;
  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
                                 int nbOutputs) TRT_NOEXCEPT override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc *out,
                       int nbOutputs) TRT_NOEXCEPT override;
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                          const nvinfer1::PluginTensorDesc *outputs,
                          int nbOutputs) const TRT_NOEXCEPT override;
  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
              const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
              void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;

  // IPluginV2Ext Methods
  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
                                       int nbInputs) const TRT_NOEXCEPT override;

  // IPluginV2 Methods
  const char *getPluginType() const TRT_NOEXCEPT override;
  const char *getPluginVersion() const TRT_NOEXCEPT override;
  int getNbOutputs() const TRT_NOEXCEPT override;
  size_t getSerializationSize() const TRT_NOEXCEPT override;
  void serialize(void *buffer) const TRT_NOEXCEPT override;

 private:
  // Serialized configuration (written/read in this order by serialize /
  // the deserialization constructor).
  int mMode;
  int mPaddingMode;
  bool mAlignCorners;
};

// Factory registered with TensorRT; creates and deserializes TRTGridSampler.
class TRTGridSamplerCreator : public TRTPluginCreatorBase {
 public:
  TRTGridSamplerCreator();

  ~TRTGridSamplerCreator() TRT_NOEXCEPT override = default;

  const char *getPluginName() const TRT_NOEXCEPT override;
  const char *getPluginVersion() const TRT_NOEXCEPT override;
  nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
      TRT_NOEXCEPT override;
  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
                                         size_t serialLength) TRT_NOEXCEPT override;
};

}  // namespace mmdeploy
#endif  // TRT_GRID_SAMPLER_HPP
csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.cu
0 → 100644
View file @
546b4279
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cuh
// and
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cu
#include <cuda_fp16.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <vector>
#include "common_cuda_helper.hpp"
#include "trt_grid_sampler_kernel.hpp"
#include "trt_plugin_helper.hpp"
using
mmdeploy
::
TensorDesc
;
// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value,
// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5).
// if align_corners: -1 and +1 get sent to the centers of the corner pixels
//     -1 --> 0
//     +1 --> (size - 1)
//     scale_factor = (size - 1) / 2
// if not align_corners: -1 and +1 get sent to the image edges
//     -1 --> -0.5
//     +1 --> (size - 1) + 0.5 == size - 0.5
//     scale_factor = size / 2
template <typename scalar_t>
static __forceinline__ __device__ scalar_t grid_sampler_unnormalize(scalar_t coord, int size,
                                                                    bool align_corners) {
  if (align_corners) {
    // unnormalize coord from [-1, 1] to [0, size - 1]
    return ((coord + 1.f) / 2) * (size - 1);
  } else {
    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
    return ((coord + 1.f) * size - 1) / 2;
  }
}
// Clamps a sampling coordinate into the valid pixel range [0, clip_limit - 1].
template <typename scalar_t>
static __forceinline__ __device__ scalar_t clip_coordinates(scalar_t in, int clip_limit) {
  const scalar_t lower = static_cast<scalar_t>(0);
  const scalar_t upper = static_cast<scalar_t>(clip_limit - 1);
  return ::min(upper, ::max(in, lower));
}
// Reflects coordinates until they fall between low and high (inclusive).
// The bounds are passed as twice their value so that half-integer values
// can be represented as ints.
template <typename scalar_t>
static __forceinline__ __device__ scalar_t reflect_coordinates(scalar_t in, int twice_low,
                                                               int twice_high) {
  // Degenerate interval: everything reflects onto the single point 0.
  if (twice_low == twice_high) {
    return static_cast<scalar_t>(0);
  }
  // Recover the real-valued interval [min, min + span] from the doubled ints.
  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
  in = ::fabs(in - min);
  // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
  scalar_t extra = ::fmod(in, span);
  // Even number of reflections keeps direction; odd flips it.
  int flips = static_cast<int>(::floor(in / span));
  if (flips % 2 == 0) {
    return extra + min;
  } else {
    return span - extra + min;
  }
}
// Replaces values that cannot be safely converted to int (too large, too
// small, or non-finite) with -100.0 so later int casts are well defined.
template <typename scalar_t>
static __forceinline__ __device__ scalar_t safe_downgrade_to_int_range(scalar_t x) {
  // -100.0 does not have special meaning. This is just to make sure
  // it's not within_bounds_2d or within_bounds_3d, and does not cause
  // undefined behavior. See #35506.
  if (x > INT_MAX - 1 || x < INT_MIN || !::isfinite(static_cast<double>(x)))
    return static_cast<scalar_t>(-100.0);
  return x;
}
// Computes the pixel source index value for a grid coordinate
// Pipeline: unnormalize from [-1, 1] to pixel space, apply the padding-mode
// transform (border clamp or reflection+clamp; Zeros leaves the coordinate
// out of range so the caller can skip it), then sanitize for int conversion.
template <typename scalar_t>
static __forceinline__ __device__ scalar_t grid_sampler_compute_source_index(
    scalar_t coord, int size, GridSamplerPadding padding_mode, bool align_corners) {
  coord = grid_sampler_unnormalize(coord, size, align_corners);
  if (padding_mode == GridSamplerPadding::Border) {
    // clip coordinates to image borders
    coord = clip_coordinates(coord, size);
  } else if (padding_mode == GridSamplerPadding::Reflection) {
    // reflect coordinates by image borders
    // With align_corners the reflection axes are the corner-pixel centers;
    // without, they are the half-pixel image edges (doubled-int encoding).
    if (align_corners) {
      coord = reflect_coordinates(coord, 0, 2 * (size - 1));
    } else {
      coord = reflect_coordinates(coord, -1, 2 * size - 1);
    }
    // clip coordinates to image borders
    coord = clip_coordinates(coord, size);
  }
  coord = safe_downgrade_to_int_range(coord);
  return coord;
}
// True when (h, w) is a valid pixel index for an H x W image.
static __forceinline__ __device__ bool within_bounds_2d(int h, int w, int H, int W) {
  const bool row_ok = h >= 0 && h < H;
  const bool col_ok = w >= 0 && w < W;
  return row_ok && col_ok;
}
// True when (d, h, w) is a valid voxel index for a D x H x W volume.
static __forceinline__ __device__ bool within_bounds_3d(int d, int h, int w, int D, int H,
                                                        int W) {
  const bool depth_ok = d >= 0 && d < D;
  const bool row_ok = h >= 0 && h < H;
  const bool col_ok = w >= 0 && w < W;
  return depth_ok && row_ok && col_ok;
}
// 2-D grid_sample kernel. One thread per output element (n, h, w); each
// thread loops over all C channels. input is NCHW, grid is N x out_H x
// out_W x 2 with (x, y) in [-1, 1]. Strides come from the TensorDescs, so
// non-contiguous layouts are supported.
template <typename scalar_t>
__global__ void grid_sampler_2d_kernel(const int nthreads, const scalar_t *input,
                                       const scalar_t *grid, scalar_t *output,
                                       TensorDesc input_desc, TensorDesc grid_desc,
                                       TensorDesc output_desc,
                                       const GridSamplerInterpolation interpolation_mode,
                                       const GridSamplerPadding padding_mode,
                                       bool align_corners) {
  int C = input_desc.shape[1];
  int inp_H = input_desc.shape[2];
  int inp_W = input_desc.shape[3];
  int out_H = grid_desc.shape[1];
  int out_W = grid_desc.shape[2];
  // Per-dimension strides (in elements) for input / grid / output.
  int inp_sN = input_desc.stride[0];
  int inp_sC = input_desc.stride[1];
  int inp_sH = input_desc.stride[2];
  int inp_sW = input_desc.stride[3];
  int grid_sN = grid_desc.stride[0];
  int grid_sH = grid_desc.stride[1];
  int grid_sW = grid_desc.stride[2];
  int grid_sCoor = grid_desc.stride[3];  // step between x and y components
  int out_sN = output_desc.stride[0];
  int out_sC = output_desc.stride[1];
  int out_sH = output_desc.stride[2];
  int out_sW = output_desc.stride[3];
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Unflatten the linear index into (n, h, w); channels handled in a loop.
    const int w = index % out_W;
    const int h = (index / out_W) % out_H;
    const int n = index / (out_H * out_W);
    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
    // get the corresponding input x, y coordinates from grid
    scalar_t ix = grid[grid_offset];
    scalar_t iy = grid[grid_offset + grid_sCoor];
    ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners);
    iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners);
    if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
      // get NE, NW, SE, SW pixel values from (x, y)
      int ix_nw = static_cast<int>(::floor(ix));
      int iy_nw = static_cast<int>(::floor(iy));
      int ix_ne = ix_nw + 1;
      int iy_ne = iy_nw;
      int ix_sw = ix_nw;
      int iy_sw = iy_nw + 1;
      int ix_se = ix_nw + 1;
      int iy_se = iy_nw + 1;
      // get surfaces to each neighbor: each weight is the area of the
      // rectangle between (ix, iy) and the opposite corner.
      scalar_t nw = (ix_se - ix) * (iy_se - iy);
      scalar_t ne = (ix - ix_sw) * (iy_sw - iy);
      scalar_t sw = (ix_ne - ix) * (iy - iy_ne);
      scalar_t se = (ix - ix_nw) * (iy - iy_nw);
      // calculate bilinear weighted pixel value and set output pixel
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
      for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
        // Out-of-bounds corners contribute 0 (zeros padding behavior).
        *out_ptr_NCHW = static_cast<scalar_t>(0);
        if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
          *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
        }
        if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
          *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
        }
        if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
          *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
        }
        if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
          *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
        }
      }
    } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
      int ix_nearest = static_cast<int>(::round(ix));
      int iy_nearest = static_cast<int>(::round(iy));
      // assign nearest neighbor pixel value to output pixel
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
      for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
        if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
          *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
        } else {
          *out_ptr_NCHW = static_cast<scalar_t>(0);
        }
      }
    }
  }
}
// 3-D grid_sample kernel. One thread per output element (n, d, h, w); each
// thread loops over all C channels. input is NCDHW, grid is
// N x out_D x out_H x out_W x 3 with (x, y, z) in [-1, 1].
template <typename scalar_t>
__global__ void grid_sampler_3d_kernel(const int nthreads, const scalar_t *input,
                                       const scalar_t *grid, scalar_t *output,
                                       TensorDesc input_desc, TensorDesc grid_desc,
                                       TensorDesc output_desc,
                                       const GridSamplerInterpolation interpolation_mode,
                                       const GridSamplerPadding padding_mode,
                                       bool align_corners) {
  int C = input_desc.shape[1];
  int inp_D = input_desc.shape[2];
  int inp_H = input_desc.shape[3];
  int inp_W = input_desc.shape[4];
  int out_D = grid_desc.shape[1];
  int out_H = grid_desc.shape[2];
  int out_W = grid_desc.shape[3];
  // Per-dimension strides (in elements) for input / grid / output.
  int inp_sN = input_desc.stride[0];
  int inp_sC = input_desc.stride[1];
  int inp_sD = input_desc.stride[2];
  int inp_sH = input_desc.stride[3];
  int inp_sW = input_desc.stride[4];
  int grid_sN = grid_desc.stride[0];
  int grid_sD = grid_desc.stride[1];
  int grid_sH = grid_desc.stride[2];
  int grid_sW = grid_desc.stride[3];
  int grid_sCoor = grid_desc.stride[4];  // step between x, y and z components
  int out_sN = output_desc.stride[0];
  int out_sC = output_desc.stride[1];
  int out_sD = output_desc.stride[2];
  int out_sH = output_desc.stride[3];
  int out_sW = output_desc.stride[4];
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Unflatten the linear index into (n, d, h, w).
    const int w = index % out_W;
    const int h = (index / out_W) % out_H;
    const int d = (index / (out_H * out_W)) % out_D;
    const int n = index / (out_D * out_H * out_W);
    const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
    // get the corresponding input x, y, z coordinates from grid
    scalar_t ix = grid[grid_offset];
    scalar_t iy = grid[grid_offset + grid_sCoor];
    scalar_t iz = grid[grid_offset + 2 * grid_sCoor];
    ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners);
    iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners);
    iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode, align_corners);
    if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
      // get corner pixel values from (x, y, z)
      // for 4d, we used north-east-south-west
      // for 5d, we add top-bottom
      int ix_tnw = static_cast<int>(::floor(ix));
      int iy_tnw = static_cast<int>(::floor(iy));
      int iz_tnw = static_cast<int>(::floor(iz));
      int ix_tne = ix_tnw + 1;
      int iy_tne = iy_tnw;
      int iz_tne = iz_tnw;
      int ix_tsw = ix_tnw;
      int iy_tsw = iy_tnw + 1;
      int iz_tsw = iz_tnw;
      int ix_tse = ix_tnw + 1;
      int iy_tse = iy_tnw + 1;
      int iz_tse = iz_tnw;
      int ix_bnw = ix_tnw;
      int iy_bnw = iy_tnw;
      int iz_bnw = iz_tnw + 1;
      int ix_bne = ix_tnw + 1;
      int iy_bne = iy_tnw;
      int iz_bne = iz_tnw + 1;
      int ix_bsw = ix_tnw;
      int iy_bsw = iy_tnw + 1;
      int iz_bsw = iz_tnw + 1;
      int ix_bse = ix_tnw + 1;
      int iy_bse = iy_tnw + 1;
      int iz_bse = iz_tnw + 1;
      // get surfaces to each neighbor: each weight is the volume of the box
      // between (ix, iy, iz) and the diagonally opposite corner.
      scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
      scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
      scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
      scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
      scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
      scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
      scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
      scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
      for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
        // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) *
        // tne
        // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) *
        // tse
        // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) *
        // bne
        // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) *
        // bse
        *out_ptr_NCDHW = static_cast<scalar_t>(0);
        if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw;
        }
        if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne;
        }
        if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw;
        }
        if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse;
        }
        if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw;
        }
        if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne;
        }
        if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw;
        }
        if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse;
        }
      }
    } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
      int ix_nearest = static_cast<int>(::round(ix));
      int iy_nearest = static_cast<int>(::round(iy));
      int iz_nearest = static_cast<int>(::round(iz));
      // assign nearest neighbor pixel value to output pixel
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
      for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
        if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW =
              inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW];
        } else {
          *out_ptr_NCDHW = static_cast<scalar_t>(0);
        }
      }
    }
  }
}
// Fill a TensorDesc from raw dims: copy the shape and derive the strides of
// a dense, row-major (C-contiguous) layout.
void create_desc(const int *dims, int nb_dims, TensorDesc &desc) {
  memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims);
  // Walk the axes innermost-first, carrying the running element count.
  int running = 1;
  for (int axis = nb_dims - 1; axis >= 0; --axis) {
    desc.stride[axis] = running;
    running *= desc.shape[axis];
  }
}
// Host-side launcher for grid_sample. Builds dense TensorDescs from the raw
// dim arrays, then dispatches to the 2-D (nb_dims == 4, NCHW) or 3-D
// (nb_dims == 5, NCDHW) kernel on the given stream.
template <typename T>
void grid_sample(T *output, const T *input, const T *grid, int *output_dims, int *input_dims,
                 int *grid_dims, int nb_dims, GridSamplerInterpolation interp,
                 GridSamplerPadding padding, bool align_corners, cudaStream_t stream) {
  TensorDesc input_desc;
  create_desc(input_dims, nb_dims, input_desc);
  TensorDesc output_desc;
  create_desc(output_dims, nb_dims, output_desc);
  TensorDesc grid_desc;
  create_desc(grid_dims, nb_dims, grid_desc);
  // One thread per output spatial location; the channel dim (i == 1) is
  // skipped because each kernel thread iterates over all channels itself.
  int count = 1;
  for (int i = 0; i < nb_dims; ++i) {
    if (i == 1) {
      continue;
    }
    count *= output_desc.shape[i];
  }
  if (nb_dims == 4) {
    grid_sampler_2d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
        count, input, grid, output, input_desc, grid_desc, output_desc, interp, padding,
        align_corners);
  } else if (nb_dims == 5) {
    grid_sampler_3d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
        count, input, grid, output, input_desc, grid_desc, output_desc, interp, padding,
        align_corners);
  } else {
    // Unsupported rank: report and fall through without launching anything.
    printf("input and grid dims should be 4 or 5\n");
  }
}
// Explicit instantiation for float so the definition is emitted in this
// translation unit and linkable from the plugin code.
template void grid_sample<float>(float *output, const float *input, const float *grid,
                                 int *output_dims, int *input_dims, int *grid_dims, int nb_dims,
                                 GridSamplerInterpolation interp, GridSamplerPadding padding,
                                 bool align_corners, cudaStream_t stream);
csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.hpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GRID_SAMPLER_KERNEL_HPP
#define TRT_GRID_SAMPLER_KERNEL_HPP
#include <cuda_runtime.h>
enum
class
GridSamplerInterpolation
{
Bilinear
,
Nearest
};
enum
class
GridSamplerPadding
{
Zeros
,
Border
,
Reflection
};
// Launch grid_sample on `stream`. nb_dims selects 2-D (4, NCHW) or 3-D
// (5, NCDHW) sampling; the *_dims arrays each hold nb_dims extents. All
// pointers are device memory.
template <typename T>
void grid_sample(T *output, const T *input, const T *grid, int *output_dims, int *input_dims,
                 int *grid_dims, int nb_dims, GridSamplerInterpolation interp,
                 GridSamplerPadding padding, bool align_corners, cudaStream_t stream);
#endif // TRT_GRID_SAMPLER_KERNEL_HPP
csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.cpp
0 → 100644
View file @
546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp
#include "trt_instance_norm.hpp"
#include <cuda_fp16.h>
#include <stdexcept>
#include "trt_serialize.hpp"
using
namespace
nvinfer1
;
namespace
mmdeploy
{
namespace
{
constexpr
const
char
*
PLUGIN_VERSION
{
"1"
};
constexpr
const
char
*
PLUGIN_NAME
{
"TRTInstanceNormalization"
};
}
// namespace
// Construct from user parameters (network building path).
TRTInstanceNormalization::TRTInstanceNormalization(const std::string &name, float epsilon)
    : TRTPluginBase(name), mEpsilon(epsilon) {}
// Construct from a serialized engine blob (deserialization path); must read
// fields in the same order serialize() wrote them.
TRTInstanceNormalization::TRTInstanceNormalization(const std::string &name,
                                                   void const *serialData, size_t serialLength)
    : TRTPluginBase(name) {
  deserialize_value(&serialData, &serialLength, &mEpsilon);
}
// cuDNN descriptors are released in detachFromContext(), not here.
TRTInstanceNormalization::~TRTInstanceNormalization() {}
// TRTInstanceNormalization returns one output.
int TRTInstanceNormalization::getNbOutputs() const TRT_NOEXCEPT { return 1; }
// Output shape equals the shape of the data input (inputs[0]).
DimsExprs TRTInstanceNormalization::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
  nvinfer1::DimsExprs output(inputs[0]);
  return output;
}
// Workspace holds two float buffers of n*c elements each (batch-replicated
// scale and bias used by enqueue), each padded to alignment.
size_t TRTInstanceNormalization::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
                                                  int nbInputs,
                                                  const nvinfer1::PluginTensorDesc *outputs,
                                                  int nbOutputs) const TRT_NOEXCEPT {
  int n = inputs[0].dims.d[0];
  int c = inputs[0].dims.d[1];
  int elem_size = sizeof(float);
  return getAlignedSize(n * c * elem_size) * 2;
}
// Run instance normalization via cuDNN batch norm: the (N, C, H, W) input is
// described to cuDNN as (1, N*C, H, W), so batch-norm statistics computed per
// "channel" become per-(sample, channel) statistics, i.e. instance norm.
// The per-channel scale/bias (inputs[1]/inputs[2], length C) are replicated N
// times into the workspace so cuDNN sees one value per N*C pseudo-channel.
int TRTInstanceNormalization::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
                                      const nvinfer1::PluginTensorDesc *outputDesc,
                                      const void *const *inputs, void *const *outputs,
                                      void *workspace, cudaStream_t stream) TRT_NOEXCEPT {
  nvinfer1::Dims input_dims = inputDesc[0].dims;
  int n = input_dims.d[0];
  int c = input_dims.d[1];
  int h = input_dims.d[2];
  // 3-D inputs (N, C, H) are treated as width 1.
  int w = input_dims.nbDims > 3 ? input_dims.d[3] : 1;
  int elem_size = sizeof(float);
  // Workspace layout: [n*c scales][n*c biases], each block aligned.
  void *n_scales = (void *)workspace;
  void *n_bias = (void *)((char *)workspace + getAlignedSize(n * c * elem_size));
  const void *scales = (const void *)inputs[1];
  const void *bias = (const void *)inputs[2];
  // Replicate the C-length scale/bias once per batch element.
  for (int i = 0; i < n; ++i) {
    cudaMemcpyAsync((char *)n_scales + i * c * elem_size, scales, c * elem_size,
                    cudaMemcpyDeviceToDevice, stream);
    cudaMemcpyAsync((char *)n_bias + i * c * elem_size, bias, c * elem_size,
                    cudaMemcpyDeviceToDevice, stream);
  }
  // Scale/bias descriptor: one float per N*C pseudo-channel.
  cudnnSetTensor4dDescriptor(_b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, n * c, 1, 1);
  cudnnDataType_t cudnn_dtype{};
  convert_trt2cudnn_dtype(inputDesc[0].type, &cudnn_dtype);
  // Data descriptors use the input's dtype and the (1, N*C, H, W) view.
  cudnnSetTensor4dDescriptor(_x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w);
  cudnnSetTensor4dDescriptor(_y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w);
  float alpha = 1;
  float beta = 0;
  void const *x_ptr = inputs[0];
  void *y_ptr = outputs[0];
  cudnnSetStream(_cudnn_handle, stream);
  // Note: Use of CUDNN_BATCHNORM_SPATIAL_PERSISTENT can cause numerical
  // overflows (NaNs) for fp32 data in some circumstances. The lower-
  // performance CUDNN_BATCHNORM_SPATIAL should be used if this is not
  // acceptable.
  // Training-mode call so statistics are computed from the current input
  // (exponentialAverageFactor = 1., no running stats, no saved mean/var).
  cudnnBatchNormalizationForwardTraining(_cudnn_handle, CUDNN_BATCHNORM_SPATIAL_PERSISTENT,
                                         &alpha, &beta, _x_desc, x_ptr, _y_desc, y_ptr, _b_desc,
                                         n_scales, n_bias, 1., nullptr, nullptr, mEpsilon,
                                         nullptr, nullptr);
  // NOTE(review): cuDNN/cudaMemcpyAsync status codes are ignored here —
  // failures would go unreported; consider checking them.
  return 0;
}
// Serialized state is just mEpsilon.
size_t TRTInstanceNormalization::getSerializationSize() const TRT_NOEXCEPT {
  return serialized_size(mEpsilon);
}
// Write mEpsilon into the engine blob; order must match the
// deserializing constructor.
void TRTInstanceNormalization::serialize(void *buffer) const TRT_NOEXCEPT {
  serialize_value(&buffer, mEpsilon);
}
// I/O layout contract: pos 0 is the data input, pos 3 the output — both may
// be fp32 or fp16 but must share one type; pos 1/2 (scale/bias) are always
// fp32. Everything must be linear format.
bool TRTInstanceNormalization::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
    int nbOutputs) TRT_NOEXCEPT {
  const auto &desc = ioDesc[pos];
  const bool linear = desc.format == nvinfer1::PluginFormat::kLINEAR;
  if (pos == 0 || pos == 3) {
    const bool fp_type = desc.type == nvinfer1::DataType::kFLOAT ||
                         desc.type == nvinfer1::DataType::kHALF;
    return fp_type && linear && desc.type == ioDesc[0].type;
  }
  if (pos == 1 || pos == 2) {
    return desc.type == nvinfer1::DataType::kFLOAT && linear;
  }
  return false;
}
// Registry type name of this plugin.
const char *TRTInstanceNormalization::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
// Registry version of this plugin.
const char *TRTInstanceNormalization::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}
// Deep-copy the plugin (TensorRT owns and frees the returned object).
IPluginV2DynamicExt *TRTInstanceNormalization::clone() const TRT_NOEXCEPT {
  auto *plugin = new TRTInstanceNormalization{mLayerName, mEpsilon};
  plugin->setPluginNamespace(mPluginNamespace.c_str());
  return plugin;
}
// Output dtype mirrors the data input's dtype.
nvinfer1::DataType TRTInstanceNormalization::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT {
  return inputTypes[0];
}
// Attach the plugin object to an execution context and grant the plugin the
// access to some context resource. Stores the shared cuDNN handle and
// creates the tensor descriptors used by enqueue().
void TRTInstanceNormalization::attachToContext(cudnnContext *cudnnContext,
                                               cublasContext *cublasContext,
                                               IGpuAllocator *gpuAllocator) TRT_NOEXCEPT {
  _cudnn_handle = cudnnContext;
  cudnnCreateTensorDescriptor(&_b_desc);
  cudnnCreateTensorDescriptor(&_x_desc);
  cudnnCreateTensorDescriptor(&_y_desc);
}
// Detach the plugin object from its execution context: destroy the cuDNN
// tensor descriptors created in attachToContext() and null the handles so a
// second detach is a no-op.
void TRTInstanceNormalization::detachFromContext() TRT_NOEXCEPT {
  auto release = [](cudnnTensorDescriptor_t &desc) {
    if (desc) {
      cudnnDestroyTensorDescriptor(desc);
      desc = nullptr;
    }
  };
  release(_y_desc);
  release(_x_desc);
  release(_b_desc);
}
// No shape- or format-dependent state to precompute; intentionally empty.
void TRTInstanceNormalization::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in,
                                               int nbInputs,
                                               const nvinfer1::DynamicPluginTensorDesc *out,
                                               int nbOutputs) TRT_NOEXCEPT {}
// TRTInstanceNormalizationCreator methods
// Declare the plugin's single creation-time attribute ("epsilon") so the
// parser can populate it via the PluginFieldCollection.
TRTInstanceNormalizationCreator::TRTInstanceNormalizationCreator() {
  mPluginAttributes.clear();
  mPluginAttributes.emplace_back(PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}
// Name under which createPlugin()/deserializePlugin() are looked up.
const char *TRTInstanceNormalizationCreator::getPluginName() const TRT_NOEXCEPT {
  return PLUGIN_NAME;
}
// Version paired with the plugin name in the registry.
const char *TRTInstanceNormalizationCreator::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}
// Build a plugin instance from parser-supplied fields. Only "epsilon" is
// recognized; it defaults to 1e-5 when absent.
IPluginV2DynamicExt *TRTInstanceNormalizationCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
  float epsilon = 1e-5;
  const PluginField *fields = fc->fields;
  for (int i = 0; i < fc->nbFields; ++i) {
    const char *attrName = fields[i].name;
    if (!strcmp(attrName, "epsilon")) {
      epsilon = *(static_cast<const float *>(fields[i].data));
    }
  }
  // TensorRT takes ownership of the returned object.
  TRTInstanceNormalization *obj = new TRTInstanceNormalization(name, epsilon);
  obj->setPluginNamespace(mNamespace.c_str());
  return obj;
}
// Recreate a plugin instance from an engine blob produced by serialize().
IPluginV2DynamicExt *TRTInstanceNormalizationCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
  TRTInstanceNormalization *obj = new TRTInstanceNormalization{name, serialData, serialLength};
  obj->setPluginNamespace(mNamespace.c_str());
  return obj;
}
// Register the creator with TensorRT's global plugin registry at load time.
REGISTER_TENSORRT_PLUGIN(TRTInstanceNormalizationCreator);
}
// namespace mmdeploy
csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.hpp
0 → 100644
View file @
546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.h
#ifndef TRT_INSTANCE_NORMALIZATION_HPP
#define TRT_INSTANCE_NORMALIZATION_HPP
#include <cudnn.h>
#include <iostream>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
// 16-bit storage type for half-precision values (bit container only; no
// arithmetic semantics implied). `using` alias replaces the legacy typedef.
using half_type = unsigned short;
namespace
mmdeploy
{
// TensorRT dynamic-shape plugin implementing instance normalization via
// cuDNN (see the .cpp for the batch-norm-on-(1, N*C, H, W) trick).
// Inputs: data (fp32/fp16), scale (fp32), bias (fp32); one output of the
// input's dtype and shape. Serialized state: epsilon only.
class TRTInstanceNormalization final : public TRTPluginBase {
 public:
  // Build-time constructor.
  TRTInstanceNormalization(const std::string &name, float epsilon);

  // Deserialization constructor (reads what serialize() wrote).
  TRTInstanceNormalization(const std::string &name, void const *serialData, size_t serialLength);

  // It doesn't make sense to make TRTInstanceNormalization without arguments.
  TRTInstanceNormalization() = delete;

  ~TRTInstanceNormalization() TRT_NOEXCEPT override;

  int getNbOutputs() const TRT_NOEXCEPT override;

  // DynamicExt plugins returns DimsExprs class instead of Dims
  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
                                          int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
      TRT_NOEXCEPT override;

  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                          const nvinfer1::PluginTensorDesc *outputs,
                          int nbOutputs) const TRT_NOEXCEPT override;

  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
              const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
              void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;

  size_t getSerializationSize() const TRT_NOEXCEPT override;

  void serialize(void *buffer) const TRT_NOEXCEPT override;

  // DynamicExt plugin supportsFormat update.
  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
                                 int nbOutputs) TRT_NOEXCEPT override;

  const char *getPluginType() const TRT_NOEXCEPT override;

  const char *getPluginVersion() const TRT_NOEXCEPT override;

  nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;

  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
                                       int nbInputs) const TRT_NOEXCEPT override;

  // Stores the context's cuDNN handle and creates tensor descriptors.
  void attachToContext(cudnnContext *cudnn, cublasContext *cublas,
                       nvinfer1::IGpuAllocator *allocator) TRT_NOEXCEPT override;

  // Destroys the tensor descriptors created in attachToContext().
  void detachFromContext() TRT_NOEXCEPT override;

  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc *out,
                       int nbOutputs) TRT_NOEXCEPT override;

 private:
  float mEpsilon{};                    // numerical-stability epsilon
  cudnnHandle_t _cudnn_handle{};       // borrowed from the execution context
  cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _b_desc{};  // input/output/scale-bias descs
  std::string mPluginNamespace{};
};
// Factory registered with TensorRT that creates/deserializes
// TRTInstanceNormalization instances.
class TRTInstanceNormalizationCreator : public TRTPluginCreatorBase {
 public:
  TRTInstanceNormalizationCreator();

  ~TRTInstanceNormalizationCreator() override = default;

  const char *getPluginName() const TRT_NOEXCEPT override;

  const char *getPluginVersion() const TRT_NOEXCEPT override;

  // Build a plugin from parser fields ("epsilon").
  nvinfer1::IPluginV2DynamicExt *createPlugin(const char *name,
                                              const nvinfer1::PluginFieldCollection *fc)
      TRT_NOEXCEPT override;

  // Rebuild a plugin from an engine blob.
  nvinfer1::IPluginV2DynamicExt *deserializePlugin(const char *name, const void *serialData,
                                                   size_t serialLength) TRT_NOEXCEPT override;
};
}
// namespace mmdeploy
#endif // TRT_INSTANCE_NORMALIZATION_HPP
csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_modulated_deform_conv.hpp"
#include <assert.h>
#include <chrono>
#include "trt_modulated_deform_conv_kernel.hpp"
#include "trt_serialize.hpp"
using
namespace
nvinfer1
;
namespace
mmdeploy
{
namespace
{
static
const
char
*
PLUGIN_VERSION
{
"1"
};
static
const
char
*
PLUGIN_NAME
{
"MMCVModulatedDeformConv2d"
};
}
// namespace
// Build-time constructor. Bias presence is unknown until configurePlugin()
// sees the input count, so mWithBias starts false.
ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(
    const std::string &name, const nvinfer1::Dims stride, const nvinfer1::Dims padding,
    const nvinfer1::Dims dilation, const int deformableGroup, const int group)
    : TRTPluginBase(name),
      mStride(stride),
      mPadding(padding),
      mDilation(dilation),
      mDeformableGroup(deformableGroup),
      mGroup(group) {
  mWithBias = false;
}
// Deserialization constructor; field order must match serialize().
ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(
    const std::string name, const void *data, size_t length)
    : TRTPluginBase(name) {
  deserialize_value(&data, &length, &mStride);
  deserialize_value(&data, &length, &mPadding);
  deserialize_value(&data, &length, &mDilation);
  deserialize_value(&data, &length, &mDeformableGroup);
  deserialize_value(&data, &length, &mGroup);
  // Re-derived by configurePlugin() from the actual input count.
  mWithBias = false;
}
// No owned resources; the cuBLAS handle is borrowed from the context.
ModulatedDeformableConvPluginDynamic::~ModulatedDeformableConvPluginDynamic() {}
// Deep-copy the plugin (TensorRT owns and frees the returned object).
nvinfer1::IPluginV2DynamicExt *ModulatedDeformableConvPluginDynamic::clone() const TRT_NOEXCEPT {
  ModulatedDeformableConvPluginDynamic *plugin = new ModulatedDeformableConvPluginDynamic(
      mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}
// Build the symbolic expression for one convolution output extent:
//   out = (input + 2*pad - (dilation*(weight-1) + 1)) // stride + 1
static const nvinfer1::IDimensionExpr *get_hw(const nvinfer1::IDimensionExpr *input,
                                              const nvinfer1::IDimensionExpr *weight,
                                              const nvinfer1::IDimensionExpr *stride,
                                              const nvinfer1::IDimensionExpr *pad,
                                              const nvinfer1::IDimensionExpr *dilation,
                                              nvinfer1::IExprBuilder &exprBuilder) {
  using DimOp = nvinfer1::DimensionOperation;
  auto expr_1 = exprBuilder.constant(1);
  // effective kernel extent: d*(w-1)+1
  auto kernel_0 = exprBuilder.operation(DimOp::kSUB, *weight, *expr_1);
  auto kernel_1 = exprBuilder.operation(DimOp::kPROD, *dilation, *kernel_0);
  auto kernel = exprBuilder.operation(DimOp::kSUM, *kernel_1, *expr_1);
  // (input + 2*pad - kernel) // stride + 1
  auto out_0 = exprBuilder.operation(DimOp::kSUM, *pad, *pad);
  auto out_1 = exprBuilder.operation(DimOp::kSUM, *input, *out_0);
  auto out_2 = exprBuilder.operation(DimOp::kSUB, *out_1, *kernel);
  auto out_3 = exprBuilder.operation(DimOp::kFLOOR_DIV, *out_2, *stride);
  auto out = exprBuilder.operation(DimOp::kSUM, *out_3, *expr_1);
  return out;
}
// Symbolic output shape: (N, out_channels, out_h, out_w) where N comes from
// the data input (inputs[0]), out_channels from the weight's first dim
// (inputs[3]), and the spatial extents from the conv formula in get_hw().
// Removed unused locals from the original (DimOp alias, expr_1, expr_2).
nvinfer1::DimsExprs ModulatedDeformableConvPluginDynamic::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
  auto weight_dim = inputs[3].d;
  nvinfer1::DimsExprs ret;
  ret.nbDims = 4;
  ret.d[0] = inputs[0].d[0];  // batch
  ret.d[1] = inputs[3].d[0];  // output channels
  auto input_h = inputs[0].d[2];
  auto input_w = inputs[0].d[3];
  auto weight_h = weight_dim[2];
  auto weight_w = weight_dim[3];
  // NOTE(review): d[0] is treated as the w component and d[1] as h here,
  // mirroring how the original indexed mDilation/mPadding/mStride — confirm
  // against the exporter's attribute ordering.
  auto dilation_w = exprBuilder.constant(mDilation.d[0]);
  auto dilation_h = exprBuilder.constant(mDilation.d[1]);
  auto pad_w = exprBuilder.constant(mPadding.d[0]);
  auto pad_h = exprBuilder.constant(mPadding.d[1]);
  auto stride_w = exprBuilder.constant(mStride.d[0]);
  auto stride_h = exprBuilder.constant(mStride.d[1]);
  ret.d[2] = get_hw(input_h, weight_h, stride_h, pad_h, dilation_h, exprBuilder);
  ret.d[3] = get_hw(input_w, weight_w, stride_w, pad_w, dilation_w, exprBuilder);
  return ret;
}
// The data input (pos 0) may be fp32 or fp16 in linear format; every other
// tensor must match the data input's type and format exactly.
bool ModulatedDeformableConvPluginDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
    int nbOutputs) TRT_NOEXCEPT {
  const auto &desc = ioDesc[pos];
  if (pos != 0) {
    return desc.type == ioDesc[0].type && desc.format == ioDesc[0].format;
  }
  const bool is_fp = desc.type == nvinfer1::DataType::kFLOAT ||
                     desc.type == nvinfer1::DataType::kHALF;
  return is_fp && desc.format == nvinfer1::TensorFormat::kLINEAR;
}
// Record whether a bias tensor is present: 5 inputs means
// (x, offset, mask, weight, bias). Assigning the comparison (rather than
// only setting true) keeps the flag correct if the plugin is ever
// reconfigured without a bias — the original latched true forever.
void ModulatedDeformableConvPluginDynamic::configurePlugin(
    const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) TRT_NOEXCEPT {
  mWithBias = (nbInputs == 5);
}
// Workspace: one aligned im2col column buffer of
// in_channels * kW * kH * out_h * out_w elements. The original computed
// several values it never used (batch size, input extents, output channels,
// im2col_step); those dead locals are removed here — the returned size is
// unchanged.
size_t ModulatedDeformableConvPluginDynamic::getWorkspaceSize(
    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const TRT_NOEXCEPT {
  int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type);
  int nInputPlane = inputs[0].dims.d[1];
  int outputHeight = outputs[0].dims.d[2];
  int outputWidth = outputs[0].dims.d[3];
  // NOTE(review): kW/kH are read from weight dims d[2]/d[3] in this order,
  // as in the original; only their product matters for the size.
  int kW = inputs[3].dims.d[2];
  int kH = inputs[3].dims.d[3];
  size_t col_size =
      mmdeploy::getAlignedSize(nInputPlane * kW * kH * outputHeight * outputWidth * sizeof_dtype);
  return col_size;
}
// Dispatch the modulated deformable convolution to the typed CUDA launcher.
// Returns 0 on success, 1 for an unsupported dtype.
int ModulatedDeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
                                                  const nvinfer1::PluginTensorDesc *outputDesc,
                                                  const void *const *inputs, void *const *outputs,
                                                  void *workSpace,
                                                  cudaStream_t stream) TRT_NOEXCEPT {
  int batch = inputDesc[0].dims.d[0];
  int channels = inputDesc[0].dims.d[1];
  int height = inputDesc[0].dims.d[2];
  int width = inputDesc[0].dims.d[3];
  int channels_out = outputDesc[0].dims.d[1];
  int kernel_h = inputDesc[3].dims.d[2];
  int kernel_w = inputDesc[3].dims.d[3];
  // Input order fixed by the exporter: x, offset, mask, weight[, bias].
  const void *x = inputs[0];
  const void *offset = inputs[1];
  const void *mask = inputs[2];
  const void *weight = inputs[3];
  const void *bias = mWithBias ? inputs[4] : nullptr;
  void *output = outputs[0];
  // Process at most 32 images per im2col pass.
  int im2col_step = std::min(batch, 32);
  // TODO: add fp16 support
  auto data_type = inputDesc[0].type;
  switch (data_type) {
    case nvinfer1::DataType::kFLOAT:
      ModulatedDeformConvForwardCUDAKernelLauncher<float>(
          (float *)x, (float *)weight, (float *)bias, (float *)offset, (float *)mask,
          (float *)output, workSpace, batch, channels, height, width, channels_out, kernel_w,
          kernel_h, mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0],
          mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream);
      break;
    case nvinfer1::DataType::kHALF:
      ModulatedDeformConvForwardCUDAKernelLauncher<half>(
          (half *)x, (half *)weight, (half *)bias, (half *)offset, (half *)mask, (half *)output,
          workSpace, batch, channels, height, width, channels_out, kernel_w, kernel_h,
          mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0],
          mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream);
      break;
    default:
      // Unsupported dtype: report failure to TensorRT.
      return 1;
      break;
  }
  return 0;
}
// Output dtype mirrors the data input's dtype.
nvinfer1::DataType ModulatedDeformableConvPluginDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT {
  return inputTypes[0];
}
// IPluginV2 Methods
// Registered plugin type name (see PLUGIN_NAME in this translation unit).
const char *ModulatedDeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT {
  return PLUGIN_NAME;
}
// Registered plugin version string (see PLUGIN_VERSION in this translation unit).
const char *ModulatedDeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}
// The deformable convolution produces a single output tensor.
int ModulatedDeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; }
// Total byte count written by serialize(); keep the list of fields here in
// lock-step with serialize() below.
size_t ModulatedDeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT {
  size_t total = 0;
  total += serialized_size(mStride);
  total += serialized_size(mPadding);
  total += serialized_size(mDilation);
  total += serialized_size(mDeformableGroup);
  total += serialized_size(mGroup);
  return total;
}
// Serialize plugin attributes into `buffer` for engine serialization.
// NOTE(review): the write order here must match the read order of the
// deserializing constructor (defined earlier in this file, outside this
// chunk) — verify before reordering any field.
void ModulatedDeformableConvPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT {
  serialize_value(&buffer, mStride);
  serialize_value(&buffer, mPadding);
  serialize_value(&buffer, mDilation);
  serialize_value(&buffer, mDeformableGroup);
  serialize_value(&buffer, mGroup);
}
// TensorRT hands the plugin library handles when it is attached to an
// execution context; keep the cuBLAS handle for the GEMMs in enqueue().
void ModulatedDeformableConvPluginDynamic::attachToContext(
    cudnnContext *cudnnContext, cublasContext *cublasContext,
    nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT {
  // cuDNN handle and allocator are not needed by this plugin.
  m_cublas_handle = cublasContext;
}
// Nothing to release: the cuBLAS handle is owned by TensorRT, not the plugin.
void ModulatedDeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {}
////////////////////// creator /////////////////////////////
// Declare the attribute names createPlugin() understands so that parsers
// (e.g. the ONNX importer) can populate the PluginFieldCollection.
ModulatedDeformableConvPluginDynamicCreator::ModulatedDeformableConvPluginDynamicCreator() {
  mPluginAttributes = std::vector<nvinfer1::PluginField>(
      {nvinfer1::PluginField("stride"), nvinfer1::PluginField("padding"),
       nvinfer1::PluginField("dilation"), nvinfer1::PluginField("groups"),
       nvinfer1::PluginField("deform_groups")});
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}
// Creator advertises the same type name as the plugin it builds.
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT {
  return PLUGIN_NAME;
}
// Creator advertises the same version as the plugin it builds.
const char *ModulatedDeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}
// Build a plugin instance from a PluginFieldCollection.
//
// Recognized fields: "stride", "padding", "dilation" (2-element int arrays),
// "groups" and "deform_groups" (scalar ints). Missing fields keep their
// defaults below. Fields with null data are skipped.
//
// Improvements over the previous revision: the repeated static_cast
// boilerplate is factored into two local helpers, and the mutually exclusive
// name checks form one else-if chain instead of independent ifs.
nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
  // Defaults: unit stride/dilation, zero padding, single (deformable) group.
  nvinfer1::Dims stride{2, {1, 1}};
  nvinfer1::Dims padding{2, {0, 0}};
  nvinfer1::Dims dilation{2, {1, 1}};
  int deformableGroup = 1;
  int group = 1;

  // Read the first int of a scalar field.
  auto read_int = [](const nvinfer1::PluginField &field) {
    return static_cast<const int *>(field.data)[0];
  };
  // Read a (h, w) pair into a 2-D Dims.
  // NOTE(review): assumes the field carries at least two ints — TODO confirm
  // against the exporter that emits these attributes.
  auto read_dims2 = [](const nvinfer1::PluginField &field, nvinfer1::Dims &dims) {
    const int *values = static_cast<const int *>(field.data);
    dims.nbDims = 2;
    dims.d[0] = values[0];
    dims.d[1] = values[1];
  };

  for (int i = 0; i < fc->nbFields; i++) {
    const nvinfer1::PluginField &field = fc->fields[i];
    if (field.data == nullptr) {
      continue;
    }
    std::string field_name(field.name);
    if (field_name.compare("deform_groups") == 0) {
      deformableGroup = read_int(field);
    } else if (field_name.compare("groups") == 0) {
      group = read_int(field);
    } else if (field_name.compare("stride") == 0) {
      read_dims2(field, stride);
    } else if (field_name.compare("padding") == 0) {
      read_dims2(field, padding);
    } else if (field_name.compare("dilation") == 0) {
      read_dims2(field, dilation);
    }
  }

  ModulatedDeformableConvPluginDynamic *plugin = new ModulatedDeformableConvPluginDynamic(
      name, stride, padding, dilation, deformableGroup, group);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}
// Rebuild a plugin from the blob produced by serialize() during engine
// deserialization; the plugin's deserializing constructor does the parsing.
nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
  auto *restored = new ModulatedDeformableConvPluginDynamic(name, serialData, serialLength);
  restored->setPluginNamespace(getPluginNamespace());
  return restored;
}
REGISTER_TENSORRT_PLUGIN
(
ModulatedDeformableConvPluginDynamicCreator
);
}
// namespace mmdeploy
csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.hpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_MODULATED_DEFORM_CONV_HPP
#define TRT_MODULATED_DEFORM_CONV_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace
mmdeploy
{
// TensorRT dynamic-shape plugin for modulated deformable convolution (DCNv2).
// Inputs (in order): x, offset, mask, weight and — optionally — bias; it
// produces one output feature map. The heavy lifting is done in the .cpp /
// .cu implementation files.
class ModulatedDeformableConvPluginDynamic : public TRTPluginBase {
 public:
  // Construct from explicit attributes (network-building path).
  ModulatedDeformableConvPluginDynamic(const std::string &name, const nvinfer1::Dims stride,
                                       const nvinfer1::Dims padding, const nvinfer1::Dims dilation,
                                       const int deformableGroup, const int group);

  // Construct from a serialized blob (engine-deserialization path).
  ModulatedDeformableConvPluginDynamic(const std::string name, const void *data, size_t length);

  // A plugin is meaningless without its attributes.
  ModulatedDeformableConvPluginDynamic() = delete;

  ~ModulatedDeformableConvPluginDynamic() TRT_NOEXCEPT override;

  // IPluginV2DynamicExt Methods
  nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
                                          int nbInputs,
                                          nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT override;
  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
                                 int nbOutputs) TRT_NOEXCEPT override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc *out,
                       int nbOutputs) TRT_NOEXCEPT override;
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                          const nvinfer1::PluginTensorDesc *outputs,
                          int nbOutputs) const TRT_NOEXCEPT override;
  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
              const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
              void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
  void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext,
                       nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override;
  void detachFromContext() TRT_NOEXCEPT override;

  // IPluginV2Ext Methods
  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
                                       int nbInputs) const TRT_NOEXCEPT override;

  // IPluginV2 Methods
  const char *getPluginType() const TRT_NOEXCEPT override;
  const char *getPluginVersion() const TRT_NOEXCEPT override;
  int getNbOutputs() const TRT_NOEXCEPT override;
  size_t getSerializationSize() const TRT_NOEXCEPT override;
  void serialize(void *buffer) const TRT_NOEXCEPT override;

 private:
  nvinfer1::Dims mStride;    // (stride_h, stride_w)
  nvinfer1::Dims mPadding;   // (pad_h, pad_w)
  nvinfer1::Dims mDilation;  // (dilation_h, dilation_w)
  int mDeformableGroup;
  int mGroup;
  // True when a bias tensor is supplied as the fifth input.
  bool mWithBias;
  // Handle provided by TensorRT via attachToContext(); not owned.
  cublasHandle_t m_cublas_handle;
};
// Factory registered with TensorRT that creates / deserializes
// ModulatedDeformableConvPluginDynamic instances.
class ModulatedDeformableConvPluginDynamicCreator : public TRTPluginCreatorBase {
 public:
  ModulatedDeformableConvPluginDynamicCreator();

  const char *getPluginName() const TRT_NOEXCEPT override;

  const char *getPluginVersion() const TRT_NOEXCEPT override;

  // Build a new plugin from parsed attributes.
  nvinfer1::IPluginV2 *createPlugin(const char *name,
                                    const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT override;

  // Rebuild a plugin from a serialized engine blob.
  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
                                         size_t serialLength) TRT_NOEXCEPT override;
};
}
// namespace mmdeploy
#endif // TRT_MODULATED_DEFORM_CONV_HPP
csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.cu
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#include <assert.h>
#include <cuda_fp16.h>
#include "common_cuda_helper.hpp"
#include "modulated_deform_conv/modulated_deform_conv_cuda.cuh"
#include "trt_modulated_deform_conv_kernel.hpp"
#include "trt_plugin_helper.hpp"
// Launch the modulated-deformable im2col kernel: expands `data_im_` into
// column buffer `data_col_` using per-position offsets and modulation masks,
// so the convolution can be computed as a GEMM afterwards.
// Note: `kenerl_w` is a (sic) typo for kernel_w kept for source compatibility.
template <typename T>
void trt_modulated_deformable_im2col(const T* data_im_, const T* data_offset_, const T* data_mask_,
                                     const int batch_size, const int channels, const int height_im,
                                     const int width_im, const int height_col, const int width_col,
                                     const int kernel_h, const int kenerl_w, const int pad_h,
                                     const int pad_w, const int stride_h, const int stride_w,
                                     const int dilation_h, const int dilation_w,
                                     const int deformable_group, T* data_col_,
                                     cudaStream_t stream) {
  // num_axes should be smaller than block size
  const int channel_per_deformable_group = channels / deformable_group;
  // One CUDA thread per (channel, batch, output position) element.
  const int num_kernels = channels * batch_size * height_col * width_col;

  modulated_deformable_im2col_gpu_kernel<T>
      <<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
          num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w,
          pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
          batch_size, channels, deformable_group, height_col, width_col, data_col_);

  cudaCheckError();
}
// Add a per-channel bias to a contiguous NCHW output tensor in place.
// step_batch = C*H*W elements per batch, step_channel = H*W elements per
// channel, so (index % step_batch) / step_channel recovers the channel id.
template <typename scalar_t>
__global__ void output_add_bias_kernel(scalar_t* output, const scalar_t* bias, size_t step_batch,
                                       size_t step_channel, size_t n) {
  CUDA_1D_KERNEL_LOOP(index, n) { output[index] += bias[(index % step_batch) / step_channel]; }
}
// __half specialization of the bias-add kernel. operator+= is not available
// for __half on all targets, so the addition is spelled out explicitly:
// native __hadd on devices that support half arithmetic (sm_53+), and a
// float round-trip fallback elsewhere.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
template <>
__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias,
                                               size_t step_batch, size_t step_channel, size_t n) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    const __half b = bias[(index % step_batch) / step_channel];
    const __half o = output[index];
    // Native half-precision add.
    output[index] = __hadd(o, b);
  }
}
#else
template <>
__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias,
                                               size_t step_batch, size_t step_channel, size_t n) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    const __half b = bias[(index % step_batch) / step_channel];
    const __half o = output[index];
    // No half arithmetic on this architecture: add in float and convert back.
    output[index] = __float2half(__half2float(o) + __half2float(b));
  }
}
#endif
// Host-side launcher for output_add_bias_kernel: computes the NCHW strides
// and adds `bias` (one value per channel) to every element of `output`.
template <typename scalar_t>
static void output_add_bias(scalar_t* output, const scalar_t* bias, size_t batch, size_t channel,
                            size_t height, size_t width, cudaStream_t stream) {
  size_t step_channel = height * width;       // elements per channel plane
  size_t step_batch = step_channel * channel; // elements per batch item
  size_t n = step_batch * batch;              // total elements
  output_add_bias_kernel<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(output, bias, step_batch,
                                                                          step_channel, n);
}
// Forward pass of modulated deformable convolution on CUDA.
//
// Strategy: for each batch item, (1) expand the input into a column buffer
// (`workspace`) with the deformable im2col kernel, then (2) multiply the
// weights with the columns group-by-group via cuBLAS GEMM, and finally
// (3) add the optional per-channel bias.
//
// `workspace` must hold at least channels * kernel_h * kernel_w *
// height_out * width_out elements of scalar_t (the column buffer for one
// batch item).
// NOTE(review): im2col_step is clamped but each iteration processes a single
// batch item (batch_size argument is 1 below), so im2col_step only gates the
// batch-divisibility assert here — confirm whether batching was intended.
template <typename scalar_t>
void ModulatedDeformConvForwardCUDAKernelLauncher(
    const scalar_t* input, const scalar_t* weight, const scalar_t* bias, const scalar_t* offset,
    const scalar_t* mask, scalar_t* output, void* workspace, int batch, int channels, int height,
    int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
    int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
    cublasHandle_t cublas_handle, cudaStream_t stream) {
  // A null bias pointer means the conv has no bias input.
  bool with_bias = (bias != nullptr);

  im2col_step = std::min(int(batch), im2col_step);
  assert(batch % im2col_step == 0);

  // Standard convolution output-size arithmetic.
  const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

  // Column buffer for one batch item lives in the caller-provided workspace.
  scalar_t* columns = (scalar_t*)workspace;

  // Per-batch strides for each tensor. The offset tensor carries 2 values
  // (dy, dx) per deformable-group kernel position; the mask carries 1.
  const size_t input_step = channels * height * width;
  const size_t offset_step = deformable_group * kernel_h * kernel_w * 2 * height_out * width_out;
  const size_t mask_step = deformable_group * kernel_h * kernel_w * height_out * width_out;
  const size_t out_step = channels_out * height_out * width_out;
  const size_t out_group_step = out_step / group;
  const size_t col_g_step = channels * kernel_w * kernel_h / group * height_out * width_out;
  const size_t weight_g_step = channels_out / group * channels / group * kernel_h * kernel_w;

  // GEMM dimensions for one group: (m x k) weights times (k x n) columns.
  const int m = channels_out / group;
  const int n = height_out * width_out;
  const int k = channels / group * kernel_h * kernel_w;
  scalar_t alpha = 1.;
  scalar_t beta = 0.;

  for (int b = 0; b < batch; b++) {
    const scalar_t* input_start = input + b * input_step;
    const scalar_t* offset_start = offset + b * offset_step;
    const scalar_t* mask_start = mask + b * mask_step;
    // Expand this batch item into the column buffer.
    trt_modulated_deformable_im2col<scalar_t>(
        input_start, offset_start, mask_start, 1, channels, height, width, height_out, width_out,
        kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
        deformable_group, columns, stream);
    for (int g = 0; g < group; g++) {
      const scalar_t* weight_start = weight + g * weight_g_step;
      scalar_t* col_start = columns + g * col_g_step;
      scalar_t* out_buffer_start = output + b * out_step + g * out_group_step;
      // cuBLAS is column-major, so computing columns^T * weight^T here
      // yields the row-major output directly.
      cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start,
                               n, weight_start, k, &beta, out_buffer_start, n);
      cudaCheckError();
    }
  }

  if (with_bias) {
    output_add_bias<scalar_t>(output, bias, batch, channels_out, height_out, width_out, stream);
  }
}
// Explicit instantiation for single precision, matching the kFLOAT branch of
// the plugin's enqueue().
template void ModulatedDeformConvForwardCUDAKernelLauncher<float>(
    const float* input, const float* weight, const float* bias, const float* offset,
    const float* mask, float* output, void* workspace, int batch, int channels, int height,
    int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
    int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
    cublasHandle_t cublas_handle, cudaStream_t stream);
// Explicit instantiation for half precision, matching the kHALF branch of
// the plugin's enqueue().
template void ModulatedDeformConvForwardCUDAKernelLauncher<__half>(
    const __half* input, const __half* weight, const __half* bias, const __half* offset,
    const __half* mask, __half* output, void* workspace, int batch, int channels, int height,
    int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
    int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
    cublasHandle_t cublas_handle, cudaStream_t stream);
csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.hpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TRT_MODULATED_DEFORM_CONV_KERNEL_HPP
#define TRT_MODULATED_DEFORM_CONV_KERNEL_HPP
#include <cublas_v2.h>
#include <cuda_runtime.h>
// Forward pass of modulated deformable convolution (DCNv2) on CUDA.
// `bias` may be null when the conv has no bias; `workspace` provides the
// im2col column buffer. Instantiated for float and __half in the .cu file.
template <typename scalar_t>
void ModulatedDeformConvForwardCUDAKernelLauncher(
    const scalar_t* input, const scalar_t* weight, const scalar_t* bias, const scalar_t* offset,
    const scalar_t* mask, scalar_t* output, void* workspace, int batch, int channels, int height,
    int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
    int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step,
    cublasHandle_t cublas_handle, cudaStream_t stream);
#endif
csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#include "trt_multi_level_roi_align.hpp"
#include <assert.h>
#include <chrono>
#include "trt_multi_level_roi_align_kernel.hpp"
#include "trt_plugin_helper.hpp"
#include "trt_serialize.hpp"
namespace
mmdeploy
{
namespace
{
static
const
char
*
PLUGIN_VERSION
{
"1"
};
static
const
char
*
PLUGIN_NAME
{
"MMCVMultiLevelRoiAlign"
};
}
// namespace
// Construct from explicit attributes (network-building path).
// `featmapStrides` holds one stride per FPN level; `poolMode` selects
// max (0) vs average pooling inside each bin; `roiScaleFactor` < 0 disables
// ROI rescaling.
TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight,
                                             int alignedWidth, int poolMode, int sampleNum,
                                             const std::vector<float> &featmapStrides,
                                             float roiScaleFactor, int finestScale, bool aligned)
    : TRTPluginBase(name),
      mAlignedHeight(alignedHeight),
      mAlignedWidth(alignedWidth),
      mPoolMode(poolMode),
      mSampleNum(sampleNum),
      mFeatmapStrides(featmapStrides),
      mRoiScaleFactor(roiScaleFactor),
      mFinestScale(finestScale),
      mAligned(aligned) {}
// Construct from a serialized blob (engine-deserialization path).
// The read order below must match the write order in serialize(); the
// variable-length mFeatmapStrides is intentionally last.
TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string name, const void *data,
                                             size_t length)
    : TRTPluginBase(name) {
  deserialize_value(&data, &length, &mAlignedHeight);
  deserialize_value(&data, &length, &mAlignedWidth);
  deserialize_value(&data, &length, &mPoolMode);
  deserialize_value(&data, &length, &mSampleNum);
  deserialize_value(&data, &length, &mRoiScaleFactor);
  deserialize_value(&data, &length, &mFinestScale);
  deserialize_value(&data, &length, &mAligned);
  deserialize_value(&data, &length, &mFeatmapStrides);
}
// Deep-copy the plugin with all of its current attributes; TensorRT calls
// this when it needs an independent instance.
nvinfer1::IPluginV2DynamicExt *TRTMultiLevelRoiAlign::clone() const TRT_NOEXCEPT {
  auto *copy =
      new TRTMultiLevelRoiAlign(mLayerName, mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum,
                                mFeatmapStrides, mRoiScaleFactor, mFinestScale, mAligned);
  copy->setPluginNamespace(getPluginNamespace());
  return copy;
}
// Output shape is (num_rois, channels, mAlignedHeight, mAlignedWidth):
// rois come from inputs[0], channels from the first feature map inputs[1].
nvinfer1::DimsExprs TRTMultiLevelRoiAlign::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
  // warning, nbInputs should equal to mFeatmapStrides.size() + 1
  nvinfer1::DimsExprs out_dims;
  out_dims.nbDims = 4;
  out_dims.d[0] = inputs[0].d[0];
  out_dims.d[1] = inputs[1].d[1];
  out_dims.d[2] = exprBuilder.constant(mAlignedHeight);
  out_dims.d[3] = exprBuilder.constant(mAlignedWidth);
  return out_dims;
}
// Every input and output must be linear-layout FP32; no other combination
// is supported by the kernel.
bool TRTMultiLevelRoiAlign::supportsFormatCombination(int pos,
                                                      const nvinfer1::PluginTensorDesc *ioDesc,
                                                      int nbInputs, int nbOutputs) TRT_NOEXCEPT {
  const nvinfer1::PluginTensorDesc &desc = ioDesc[pos];
  const bool is_fp32 = desc.type == nvinfer1::DataType::kFLOAT;
  const bool is_linear = desc.format == nvinfer1::TensorFormat::kLINEAR;
  return is_fp32 && is_linear;
}
// Called when the network is configured: trim the stride list so it matches
// the number of feature-map inputs actually bound (inputs[0] is the rois
// tensor, inputs[1..nbInputs-1] are feature maps).
void TRTMultiLevelRoiAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs,
                                            int nbInputs,
                                            const nvinfer1::DynamicPluginTensorDesc *outputs,
                                            int nbOutputs) TRT_NOEXCEPT {
  // Validate input arguments
  ASSERT(nbOutputs == 1);
  ASSERT(nbInputs >= 1);
  // Fix: the truncation below read past mFeatmapStrides.end() when more
  // feature maps were bound than strides were configured; fail loudly instead.
  ASSERT(static_cast<int>(mFeatmapStrides.size()) >= nbInputs - 1);
  mFeatmapStrides =
      std::vector<float>(mFeatmapStrides.begin(), mFeatmapStrides.begin() + (nbInputs - 1));
}
// The kernel works entirely in-place on inputs/outputs; no scratch space.
size_t TRTMultiLevelRoiAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
                                               int nbInputs,
                                               const nvinfer1::PluginTensorDesc *outputs,
                                               int nbOutputs) const TRT_NOEXCEPT {
  return 0;
}
// Run multi-level RoIAlign. inputs[0] is the rois tensor (num_rois, 5);
// inputs[1..] are the per-level feature maps (N, C, H, W). Shapes of every
// level are gathered into fixed-size arrays and handed to the CUDA launcher.
int TRTMultiLevelRoiAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
                                   const nvinfer1::PluginTensorDesc *outputDesc,
                                   const void *const *inputs, void *const *outputs, void *workSpace,
                                   cudaStream_t stream) TRT_NOEXCEPT {
  int num_rois = inputDesc[0].dims.d[0];
  int batch_size = inputDesc[1].dims.d[0];
  int channels = inputDesc[1].dims.d[1];

  // Must stay in sync with kMAX_FEATMAP_SIZE in the kernel .cu file, which
  // also fixes its per-level arrays at 10 entries.
  const int kMaxFeatMap = 10;
  int heights[kMaxFeatMap];
  int widths[kMaxFeatMap];
  float strides[kMaxFeatMap];

  int num_feats = mFeatmapStrides.size();
  // Fix: the fill loop below overflowed the stack arrays when more than
  // kMaxFeatMap levels were configured; fail loudly instead.
  ASSERT(num_feats <= kMaxFeatMap);
  for (int i = 0; i < num_feats; ++i) {
    heights[i] = inputDesc[i + 1].dims.d[2];
    widths[i] = inputDesc[i + 1].dims.d[3];
    strides[i] = mFeatmapStrides[i];
  }

  const void *rois = inputs[0];
  const void *const *feats = inputs + 1;

  multi_level_roi_align<float>((float *)outputs[0], (const float *)rois, num_rois, feats,
                               num_feats, batch_size, channels, &heights[0], &widths[0],
                               &strides[0], mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum,
                               mRoiScaleFactor, mFinestScale, mAligned, stream);

  return 0;
}
// The kernel only supports FP32 (see supportsFormatCombination), so the
// output type is fixed regardless of the inputs.
nvinfer1::DataType TRTMultiLevelRoiAlign::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT {
  return nvinfer1::DataType::kFLOAT;
}
// IPluginV2 Methods
// Registered plugin type name (see PLUGIN_NAME above).
const char *TRTMultiLevelRoiAlign::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
// Registered plugin version string (see PLUGIN_VERSION above).
const char *TRTMultiLevelRoiAlign::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
// RoIAlign produces a single pooled-feature tensor.
int TRTMultiLevelRoiAlign::getNbOutputs() const TRT_NOEXCEPT { return 1; }
// Total byte count written by serialize(); keep this field list in lock-step
// with serialize() below.
size_t TRTMultiLevelRoiAlign::getSerializationSize() const TRT_NOEXCEPT {
  size_t total = 0;
  total += serialized_size(mFeatmapStrides);
  total += serialized_size(mAlignedHeight);
  total += serialized_size(mAlignedWidth);
  total += serialized_size(mPoolMode);
  total += serialized_size(mSampleNum);
  total += serialized_size(mRoiScaleFactor);
  total += serialized_size(mFinestScale);
  total += serialized_size(mAligned);
  return total;
}
// Serialize plugin attributes into `buffer`. The write order here must match
// the read order of the deserializing constructor above; the variable-length
// mFeatmapStrides is intentionally last.
void TRTMultiLevelRoiAlign::serialize(void *buffer) const TRT_NOEXCEPT {
  serialize_value(&buffer, mAlignedHeight);
  serialize_value(&buffer, mAlignedWidth);
  serialize_value(&buffer, mPoolMode);
  serialize_value(&buffer, mSampleNum);
  serialize_value(&buffer, mRoiScaleFactor);
  serialize_value(&buffer, mFinestScale);
  serialize_value(&buffer, mAligned);
  serialize_value(&buffer, mFeatmapStrides);
}
// Declare the attribute names createPlugin() understands so that parsers
// (e.g. the ONNX importer) can populate the PluginFieldCollection.
TRTMultiLevelRoiAlignCreator::TRTMultiLevelRoiAlignCreator() {
  mPluginAttributes.clear();
  mPluginAttributes.emplace_back(nvinfer1::PluginField("output_height"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("output_width"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("pool_mode"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("sampling_ratio"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("featmap_strides"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("roi_scale_factor"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("finest_scale"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("aligned"));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}
// Creator advertises the same type name as the plugin it builds.
const char *TRTMultiLevelRoiAlignCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
// Creator advertises the same version as the plugin it builds.
const char *TRTMultiLevelRoiAlignCreator::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}
// Build a TRTMultiLevelRoiAlign from a PluginFieldCollection.
//
// Scalar int fields: output_height, output_width, pool_mode, sampling_ratio,
// finest_scale, aligned (0/1). Scalar float field: roi_scale_factor.
// Array float field: featmap_strides (one entry per FPN level, required).
// Fields with null data are skipped; absent fields keep the defaults below.
nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
  // Defaults mirroring mmcv's SingleRoIExtractor.
  int alignedHeight = 7;
  int alignedWidth = 7;
  int poolMode = 0;
  int sampleNum = 2;
  std::vector<float> featmapStrides;
  float roiScaleFactor = -1;
  int finestScale = 56;
  bool aligned = false;

  for (int i = 0; i < fc->nbFields; i++) {
    const nvinfer1::PluginField &field = fc->fields[i];
    if (field.data == nullptr) {
      continue;
    }
    std::string field_name(field.name);
    if (field_name.compare("output_height") == 0) {
      alignedHeight = static_cast<const int *>(field.data)[0];
    } else if (field_name.compare("output_width") == 0) {
      alignedWidth = static_cast<const int *>(field.data)[0];
    } else if (field_name.compare("pool_mode") == 0) {
      poolMode = static_cast<const int *>(field.data)[0];
    } else if (field_name.compare("sampling_ratio") == 0) {
      sampleNum = static_cast<const int *>(field.data)[0];
    } else if (field_name.compare("roi_scale_factor") == 0) {
      roiScaleFactor = static_cast<const float *>(field.data)[0];
    } else if (field_name.compare("finest_scale") == 0) {
      finestScale = static_cast<const int *>(field.data)[0];
    } else if (field_name.compare("featmap_strides") == 0) {
      // `length` is the element count of the float array.
      int data_size = (field.length);
      const float *data_start = static_cast<const float *>(field.data);
      featmapStrides = std::vector<float>(data_start, data_start + data_size);
    } else if (field_name.compare("aligned") == 0) {
      int aligned_int = static_cast<const int *>(field.data)[0];
      aligned = aligned_int != 0;
    }
  }

  // featmap_strides is mandatory — without it no level can be selected.
  ASSERT(featmapStrides.size() != 0);

  TRTMultiLevelRoiAlign *plugin =
      new TRTMultiLevelRoiAlign(name, alignedHeight, alignedWidth, poolMode, sampleNum,
                                featmapStrides, roiScaleFactor, finestScale, aligned);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}
// Rebuild a plugin from the blob produced by serialize() during engine
// deserialization; the plugin's deserializing constructor does the parsing.
nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
  auto *restored = new TRTMultiLevelRoiAlign(name, serialData, serialLength);
  restored->setPluginNamespace(getPluginNamespace());
  return restored;
}
REGISTER_TENSORRT_PLUGIN
(
TRTMultiLevelRoiAlignCreator
);
}
// namespace mmdeploy
csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_MULTI_LEVEL_ROI_ALIGN_HPP
#define TRT_MULTI_LEVEL_ROI_ALIGN_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace
mmdeploy
{
// TensorRT dynamic-shape plugin performing FPN-style multi-level RoIAlign:
// each roi is assigned to a pyramid level by its scale and pooled to a fixed
// (mAlignedHeight x mAlignedWidth) grid. Inputs: rois, then one feature map
// per level. FP32 only.
class TRTMultiLevelRoiAlign : public TRTPluginBase {
 public:
  // Construct from explicit attributes (network-building path).
  TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight, int alignedWidth, int poolMode,
                        int sampleNum, const std::vector<float> &featmapStrides,
                        float roiScaleFactor = -1, int finestScale = 56, bool aligned = false);

  // Construct from a serialized blob (engine-deserialization path).
  TRTMultiLevelRoiAlign(const std::string name, const void *data, size_t length);

  // A plugin is meaningless without its attributes.
  TRTMultiLevelRoiAlign() = delete;

  // IPluginV2DynamicExt Methods
  nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
                                          int nbInputs,
                                          nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT override;
  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
                                 int nbOutputs) TRT_NOEXCEPT override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc *out,
                       int nbOutputs) TRT_NOEXCEPT override;
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
                          const nvinfer1::PluginTensorDesc *outputs,
                          int nbOutputs) const TRT_NOEXCEPT override;
  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
              const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
              void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;

  // IPluginV2Ext Methods
  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
                                       int nbInputs) const TRT_NOEXCEPT override;

  // IPluginV2 Methods
  const char *getPluginType() const TRT_NOEXCEPT override;
  const char *getPluginVersion() const TRT_NOEXCEPT override;
  int getNbOutputs() const TRT_NOEXCEPT override;
  size_t getSerializationSize() const TRT_NOEXCEPT override;
  void serialize(void *buffer) const TRT_NOEXCEPT override;

 private:
  int mAlignedHeight;                  // pooled output height
  int mAlignedWidth;                   // pooled output width
  int mPoolMode;                       // 0 = max, otherwise average
  int mSampleNum;                      // sampling points per bin (<=0: adaptive)
  std::vector<float> mFeatmapStrides;  // stride of each pyramid level
  float mRoiScaleFactor;               // < 0 disables roi rescaling
  int mFinestScale;                    // scale threshold for level assignment
  bool mAligned;                       // apply the -0.5 pixel-center offset
};
// Factory registered with TensorRT that creates / deserializes
// TRTMultiLevelRoiAlign instances.
class TRTMultiLevelRoiAlignCreator : public TRTPluginCreatorBase {
 public:
  TRTMultiLevelRoiAlignCreator();

  const char *getPluginName() const TRT_NOEXCEPT override;

  const char *getPluginVersion() const TRT_NOEXCEPT override;

  // Build a new plugin from parsed attributes.
  nvinfer1::IPluginV2 *createPlugin(const char *name,
                                    const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT override;

  // Rebuild a plugin from a serialized engine blob.
  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
                                         size_t serialLength) TRT_NOEXCEPT override;
};
}
// namespace mmdeploy
#endif  // TRT_MULTI_LEVEL_ROI_ALIGN_HPP
csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#include <float.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include "common_cuda_helper.hpp"
#include "trt_multi_level_roi_align_kernel.hpp"
#include "trt_plugin_helper.hpp"
const
int
kMAX_FEATMAP_SIZE
=
10
;
struct
FeatData
{
const
void
*
data
[
kMAX_FEATMAP_SIZE
];
int
batch_size
;
int
channels
;
int
h
[
kMAX_FEATMAP_SIZE
];
int
w
[
kMAX_FEATMAP_SIZE
];
float
spatial_scale
[
kMAX_FEATMAP_SIZE
];
int
num_featmap
;
};
// Compute one output cell (pw, ph) of RoIAlign for channel `c` of one roi.
// Template parameters fix the variant at compile time: `aligned` selects the
// pixel-center convention, `pool_mode` selects max (0) vs average pooling of
// the bilinear samples inside the bin.
template <typename scalar_t, bool aligned, int pool_mode>
__device__ scalar_t roi_align_single(const scalar_t* __restrict__ bottom_data,
                                     const int roi_batch_ind, const scalar_t roi_start_w,
                                     const scalar_t roi_start_h, const scalar_t roi_end_w,
                                     const scalar_t roi_end_h, const scalar_t spatial_scale,
                                     const int pw, const int ph, const int c, const int sample_num,
                                     const int channels, const int height, const int width,
                                     const int pooled_height, const int pooled_width) {
  // Force malformed ROIs to be 1x1 (only in the legacy, non-aligned mode).
  scalar_t roi_width = max(roi_end_w - roi_start_w, (scalar_t)(aligned ? 0. : 1.));
  scalar_t roi_height = max(roi_end_h - roi_start_h, (scalar_t)(aligned ? 0. : 1.));

  // Size of one output bin in feature-map coordinates.
  const scalar_t bin_size_h = roi_height / pooled_height;
  const scalar_t bin_size_w = roi_width / pooled_width;

  // Start of the (batch, channel) plane this cell samples from.
  const scalar_t* offset_bottom_data =
      bottom_data + (roi_batch_ind * channels + c) * height * width;

  // sample_num <= 0 means adaptive sampling: one sample per feature pixel.
  const int sample_num_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height);
  const int sample_num_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);

  // Identity element of the reduction: -inf for max, 0 for sum/avg.
  scalar_t output_val = (pool_mode == 0) ? -FLT_MAX : 0;
  const scalar_t y_offset = roi_start_h + ph * bin_size_h;
  const scalar_t y_scale = bin_size_h / (scalar_t)(sample_num_h);
  const scalar_t x_offset = roi_start_w + pw * bin_size_w;
  const scalar_t x_scale = bin_size_w / (scalar_t)(sample_num_w);
  for (int iy = 0; iy < sample_num_h; iy++) {
    // Sample at the center of each sub-cell: (iy + 0.5) * scale + offset.
    const scalar_t y = fma(scalar_t(iy) + scalar_t(.5f), y_scale, y_offset);
    for (int ix = 0; ix < sample_num_w; ix++) {
      const scalar_t x = fma(scalar_t(ix) + scalar_t(.5f), x_scale, x_offset);
      scalar_t val = bilinear_interpolate<scalar_t>(offset_bottom_data, height, width, y, x);
      if (pool_mode == 0) {
        output_val = max(output_val, val);
      } else {
        output_val += val;
      }
    }
  }
  if (pool_mode != 0) {
    // Average mode: divide the accumulated sum by the sample count.
    output_val /= max(sample_num_h * sample_num_w, 1);
  }
  return output_val;
}
// One thread per output element (n, c, ph, pw): pick the pyramid level for
// roi `n` from its scale, optionally rescale the roi, then pool one cell via
// roi_align_single. `bottom_rois` rows are (batch_ind, x0, y0, x1, y1).
template <typename scalar_t, bool aligned>
__global__ void roi_extractor_kernel(scalar_t* __restrict__ output,
                                     const scalar_t* __restrict__ bottom_rois, FeatData feat_data,
                                     const int pool_mode, const int sample_num,
                                     const float roi_scale_factor, const int finest_scale,
                                     const int pooled_height, const int pooled_width,
                                     int nThreads) {
  CUDA_1D_KERNEL_LOOP(index, nThreads) {
    const int channels = feat_data.channels;
    // Decompose the flat index into (n, c, ph, pw); pw varies fastest.
    int tmp_index = index;
    const int pw = tmp_index % pooled_width;
    tmp_index /= pooled_width;
    const int ph = tmp_index % pooled_height;
    tmp_index /= pooled_height;
    const int c = tmp_index % channels;
    const int n = tmp_index / channels;

    const scalar_t* offset_bottom_rois = bottom_rois + n * 5;

    scalar_t roi_offset_x0 = offset_bottom_rois[1];
    scalar_t roi_offset_y0 = offset_bottom_rois[2];
    scalar_t roi_offset_x1 = offset_bottom_rois[3];
    scalar_t roi_offset_y1 = offset_bottom_rois[4];

    // Level assignment: sqrt(area) relative to finest_scale, log2-binned and
    // clamped to the available levels (1e-6 guards log2 of ~0).
    const scalar_t scale =
        sqrtf((roi_offset_y1 - roi_offset_y0) * (roi_offset_x1 - roi_offset_x0));

    const int target_lvls =
        min(feat_data.num_featmap - 1,
            max(0, int(floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6)))));

    if (roi_scale_factor > 0.) {
      // Grow/shrink the roi around its center by roi_scale_factor.
      const scalar_t roi_off_cx = (roi_offset_x0 + roi_offset_x1) * 0.5;
      const scalar_t roi_off_cy = (roi_offset_y0 + roi_offset_y1) * 0.5;
      const scalar_t half_scale_factor = roi_scale_factor * 0.5;
      const scalar_t half_roi_off_w =
          fma(roi_offset_x1 - roi_offset_x0 + 1, half_scale_factor, scalar_t(-0.5));
      const scalar_t half_roi_off_h =
          fma(roi_offset_y1 - roi_offset_y0 + 1, half_scale_factor, scalar_t(-0.5));
      roi_offset_x0 = roi_off_cx - half_roi_off_w;
      roi_offset_x1 = roi_off_cx + half_roi_off_w;
      roi_offset_y0 = roi_off_cy - half_roi_off_h;
      roi_offset_y1 = roi_off_cy + half_roi_off_h;
    }

    // Geometry of the chosen level.
    const scalar_t spatial_scale = (scalar_t)feat_data.spatial_scale[target_lvls];
    const int height = feat_data.h[target_lvls];
    const int width = feat_data.w[target_lvls];
    const scalar_t* bottom_data = (scalar_t*)feat_data.data[target_lvls];

    const int roi_batch_ind = offset_bottom_rois[0];
    // Aligned mode shifts by -0.5 so samples land on pixel centers.
    const scalar_t offset = aligned ? (scalar_t)-0.5 : (scalar_t)0.0;
    const scalar_t roi_start_w =
        fma(roi_offset_x0, spatial_scale, offset);  // roi_offset_x0 * spatial_scale + offset
    const scalar_t roi_start_h =
        fma(roi_offset_y0, spatial_scale, offset);  // roi_offset_y0 * spatial_scale + offset
    const scalar_t roi_end_w =
        fma(roi_offset_x1, spatial_scale, offset);  // roi_offset_x1 * spatial_scale + offset
    const scalar_t roi_end_h =
        fma(roi_offset_y1, spatial_scale, offset);  // roi_offset_y1 * spatial_scale + offset

    // Dispatch pool_mode as a template argument so the inner loop is branch-free.
    if (pool_mode == 0) {
      const scalar_t output_val = roi_align_single<scalar_t, aligned, 0>(
          bottom_data, roi_batch_ind, roi_start_w, roi_start_h, roi_end_w, roi_end_h,
          spatial_scale, pw, ph, c, sample_num, channels, height, width, pooled_height,
          pooled_width);

      output[index] = output_val;
    } else {
      const scalar_t output_val = roi_align_single<scalar_t, aligned, 1>(
          bottom_data, roi_batch_ind, roi_start_w, roi_start_h, roi_end_w, roi_end_h,
          spatial_scale, pw, ph, c, sample_num, channels, height, width, pooled_height,
          pooled_width);

      output[index] = output_val;
    }
  }
}
// Host-side launcher for the multi-level RoIAlign kernel.
//
// Packs the per-level feature-map metadata into a FeatData struct (passed to
// the kernel by value) and launches roi_extractor_kernel with the `aligned`
// flag resolved to a compile-time template parameter.
//
// output          [num_rois, c, aligned_height, aligned_width] result buffer
// rois            [num_rois, 5] boxes laid out as (batch_idx, x0, y0, x1, y1)
// feats           device pointers to the num_feats feature-map levels
// num_feats       number of pyramid levels (assumed <= FeatData capacity —
//                 TODO confirm against the FeatData declaration)
// n, c            batch size and channel count shared by all levels
// h, w, strides   per-level feature-map heights, widths and strides
// pool_mode       0 = max pooling, otherwise average (see kernel dispatch)
// stream          CUDA stream the kernel is enqueued on
template <typename T>
void multi_level_roi_align(T* output, const T* rois, int num_rois, const void* const* feats,
                           int num_feats, int n, int c, int* h, int* w, float* strides,
                           int aligned_height, int aligned_width, int pool_mode, int sample_num,
                           float roi_scale_factor, int finest_scale, bool aligned,
                           cudaStream_t stream) {
  FeatData feat_data;
  feat_data.batch_size = n;
  feat_data.channels = c;
  feat_data.num_featmap = num_feats;
  for (int i = 0; i < num_feats; ++i) {
    feat_data.data[i] = feats[i];
    feat_data.h[i] = h[i];
    feat_data.w[i] = w[i];
    // spatial_scale converts RoI (image) coordinates onto this level's grid.
    feat_data.spatial_scale[i] = 1. / float(strides[i]);
  }
  // One thread per output element.
  int nThreads = num_rois * c * aligned_height * aligned_width;
  // Guard against an empty workload (e.g. num_rois == 0): launching a kernel
  // with a zero-sized grid is an invalid CUDA launch configuration.
  if (nThreads <= 0) return;
  if (aligned) {
    roi_extractor_kernel<T, true><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
        output, rois, feat_data, pool_mode, sample_num, roi_scale_factor, finest_scale,
        aligned_height, aligned_width, nThreads);
  } else {
    roi_extractor_kernel<T, false><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
        output, rois, feat_data, pool_mode, sample_num, roi_scale_factor, finest_scale,
        aligned_height, aligned_width, nThreads);
  }
}
// Explicit instantiation for float: the template definition lives in this
// translation unit, so instantiating it here emits the symbol that callers
// (which only see a declaration in the header) link against.
template void multi_level_roi_align<float>(float* output, const float* rois, int num_rois,
                                           const void* const* feats, int num_feats, int n, int c,
                                           int* h, int* w, float* strides, int aligned_height,
                                           int aligned_width, int pool_mode, int sample_num,
                                           float roi_scale_factor, int finest_scale, bool aligned,
                                           cudaStream_t stream);
Prev
1
…
8
9
10
11
12
13
14
15
16
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment