Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dcnv3
Commits
76b26f09
Commit
76b26f09
authored
Mar 18, 2023
by
zhe chen
Browse files
Upload tensorrt custom ops (#39)
parent
7e4b1c48
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
577 additions
and
0 deletions
+577
-0
tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3.cpp
tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3.cpp
+297
-0
tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3.hpp
tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3.hpp
+78
-0
tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu
...rrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu
+185
-0
tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.hpp
...rt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.hpp
+17
-0
No files found.
tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3.cpp
0 → 100644
View file @
76b26f09
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_deform_conv_v3.hpp"
#include <assert.h>
#include <chrono>
#include "trt_deform_conv_v3_kernel.hpp"
#include "trt_plugin_helper.hpp"
#include "trt_serialize.hpp"
using
namespace
nvinfer1
;
namespace
mmdeploy
{
namespace
{
static
const
char
*
PLUGIN_VERSION
{
"1"
};
static
const
char
*
PLUGIN_NAME
{
"TRTDCNv3"
};
}
// namespace
TRTDCNv3
::
TRTDCNv3
(
const
std
::
string
&
name
,
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
int
pad_h
,
int
pad_w
,
int
dilation_h
,
int
dilation_w
,
int
group
,
int
group_channels
,
float
offset_scale
,
int
im2col_step
)
:
TRTPluginBase
(
name
),
kernel_h_
(
kernel_h
),
kernel_w_
(
kernel_w
),
stride_h_
(
stride_h
),
stride_w_
(
stride_w
),
pad_h_
(
pad_h
),
pad_w_
(
pad_w
),
dilation_h_
(
dilation_h
),
dilation_w_
(
dilation_w
),
group_
(
group
),
group_channels_
(
group_channels
),
offset_scale_
(
offset_scale
),
im2col_step_
(
im2col_step
)
{}
TRTDCNv3
::
TRTDCNv3
(
const
std
::
string
name
,
const
void
*
data
,
size_t
length
)
:
TRTPluginBase
(
name
)
{
deserialize_value
(
&
data
,
&
length
,
&
kernel_h_
);
deserialize_value
(
&
data
,
&
length
,
&
kernel_w_
);
deserialize_value
(
&
data
,
&
length
,
&
stride_h_
);
deserialize_value
(
&
data
,
&
length
,
&
stride_w_
);
deserialize_value
(
&
data
,
&
length
,
&
pad_h_
);
deserialize_value
(
&
data
,
&
length
,
&
pad_w_
);
deserialize_value
(
&
data
,
&
length
,
&
dilation_h_
);
deserialize_value
(
&
data
,
&
length
,
&
dilation_w_
);
deserialize_value
(
&
data
,
&
length
,
&
group_
);
deserialize_value
(
&
data
,
&
length
,
&
group_channels_
);
deserialize_value
(
&
data
,
&
length
,
&
offset_scale_
);
deserialize_value
(
&
data
,
&
length
,
&
im2col_step_
);
}
nvinfer1
::
IPluginV2DynamicExt
*
TRTDCNv3
::
clone
()
const
TRT_NOEXCEPT
{
TRTDCNv3
*
plugin
=
new
TRTDCNv3
(
mLayerName
,
kernel_h_
,
kernel_w_
,
stride_h_
,
stride_w_
,
pad_h_
,
pad_w_
,
dilation_h_
,
dilation_w_
,
group_
,
group_channels_
,
offset_scale_
,
im2col_step_
);
plugin
->
setPluginNamespace
(
getPluginNamespace
());
return
plugin
;
}
const
nvinfer1
::
IDimensionExpr
*
output_size
(
const
nvinfer1
::
IDimensionExpr
&
input
,
int
pad
,
int
dilation
,
int
kernel
,
int
stride
,
nvinfer1
::
IExprBuilder
&
exprBuilder
)
{
// out_expand = 2×padding[0]−dilation[0]×(kernel_size[0]−1)+1
auto
out_expand
=
exprBuilder
.
constant
(
2
*
pad
-
dilation
*
(
kernel
-
1
)
+
1
);
// out = out_expand + input
auto
out_before_div
=
exprBuilder
.
operation
(
DimensionOperation
::
kSUM
,
input
,
*
out_expand
);
// out = out / stride
auto
out_before_sub
=
exprBuilder
.
operation
(
DimensionOperation
::
kFLOOR_DIV
,
*
out_before_div
,
*
(
exprBuilder
.
constant
(
stride
)));
// out -=1
auto
out
=
exprBuilder
.
operation
(
DimensionOperation
::
kSUB
,
*
out_before_sub
,
*
(
exprBuilder
.
constant
(
1
)));
return
out
;
}
nvinfer1
::
DimsExprs
TRTDCNv3
::
getOutputDimensions
(
int
outputIndex
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nbInputs
,
nvinfer1
::
IExprBuilder
&
exprBuilder
)
TRT_NOEXCEPT
{
nvinfer1
::
DimsExprs
ret
;
ret
.
nbDims
=
4
;
ret
.
d
[
0
]
=
inputs
[
0
].
d
[
0
];
ret
.
d
[
3
]
=
exprBuilder
.
constant
(
group_
*
group_channels_
);
ret
.
d
[
1
]
=
output_size
(
*
inputs
[
0
].
d
[
1
],
pad_h_
,
dilation_h_
,
kernel_h_
,
stride_h_
,
exprBuilder
);
ret
.
d
[
2
]
=
output_size
(
*
inputs
[
0
].
d
[
2
],
pad_w_
,
dilation_w_
,
kernel_w_
,
stride_w_
,
exprBuilder
);
return
ret
;
}
bool
TRTDCNv3
::
supportsFormatCombination
(
int
pos
,
const
nvinfer1
::
PluginTensorDesc
*
ioDesc
,
int
nbInputs
,
int
nbOutputs
)
TRT_NOEXCEPT
{
if
(
pos
==
0
)
{
return
(
ioDesc
[
pos
].
type
==
nvinfer1
::
DataType
::
kFLOAT
&&
ioDesc
[
pos
].
format
==
nvinfer1
::
TensorFormat
::
kLINEAR
);
}
else
{
return
ioDesc
[
pos
].
type
==
ioDesc
[
0
].
type
&&
ioDesc
[
pos
].
format
==
ioDesc
[
0
].
format
;
}
}
void
TRTDCNv3
::
configurePlugin
(
const
nvinfer1
::
DynamicPluginTensorDesc
*
inputs
,
int
nbInputs
,
const
nvinfer1
::
DynamicPluginTensorDesc
*
outputs
,
int
nbOutputs
)
TRT_NOEXCEPT
{}
size_t
TRTDCNv3
::
getWorkspaceSize
(
const
nvinfer1
::
PluginTensorDesc
*
inputs
,
int
nbInputs
,
const
nvinfer1
::
PluginTensorDesc
*
outputs
,
int
nbOutputs
)
const
TRT_NOEXCEPT
{
int
sizeof_dtype
=
mmdeploy
::
getElementSize
(
outputs
[
0
].
type
);
int
batch_size
=
inputs
[
0
].
dims
.
d
[
0
];
int
nInputPlane
=
inputs
[
0
].
dims
.
d
[
3
];
int
inputHeight
=
inputs
[
0
].
dims
.
d
[
1
];
int
inputWidth
=
inputs
[
0
].
dims
.
d
[
2
];
int
nOutputPlane
=
outputs
[
0
].
dims
.
d
[
3
];
int
outputHeight
=
outputs
[
0
].
dims
.
d
[
1
];
int
outputWidth
=
outputs
[
0
].
dims
.
d
[
2
];
int
kW
=
inputs
[
3
].
dims
.
d
[
1
];
int
kH
=
inputs
[
3
].
dims
.
d
[
2
];
int
im2col_step
=
std
::
min
(
32
,
batch_size
);
size_t
col_size
=
mmdeploy
::
getAlignedSize
(
nInputPlane
*
kW
*
kH
*
outputHeight
*
outputWidth
*
sizeof_dtype
);
return
col_size
;
}
int
TRTDCNv3
::
enqueue
(
const
nvinfer1
::
PluginTensorDesc
*
inputDesc
,
const
nvinfer1
::
PluginTensorDesc
*
outputDesc
,
const
void
*
const
*
inputs
,
void
*
const
*
outputs
,
void
*
workSpace
,
cudaStream_t
stream
)
TRT_NOEXCEPT
{
int
batch
=
inputDesc
[
0
].
dims
.
d
[
0
];
int
height
=
inputDesc
[
0
].
dims
.
d
[
1
];
int
width
=
inputDesc
[
0
].
dims
.
d
[
2
];
int
channels
=
inputDesc
[
0
].
dims
.
d
[
3
];
int
height_out
=
outputDesc
[
0
].
dims
.
d
[
1
];
int
width_out
=
outputDesc
[
0
].
dims
.
d
[
2
];
int
channels_out
=
outputDesc
[
0
].
dims
.
d
[
3
];
const
void
*
input
=
inputs
[
0
];
const
void
*
offset
=
inputs
[
1
];
const
void
*
mask
=
inputs
[
2
];
void
*
output
=
outputs
[
0
];
// TODO: add fp16 support
auto
data_type
=
inputDesc
[
0
].
type
;
switch
(
data_type
)
{
case
nvinfer1
::
DataType
::
kFLOAT
:
DeformConvv3ForwardCUDAKernelLauncher
<
float
>
(
(
float
*
)
input
,
(
float
*
)
offset
,
(
float
*
)
mask
,
(
float
*
)
output
,
workSpace
,
batch
,
channels
,
height
,
width
,
channels_out
,
kernel_w_
,
kernel_h_
,
stride_w_
,
stride_h_
,
pad_w_
,
pad_h_
,
dilation_w_
,
dilation_h_
,
group_
,
group_channels_
,
offset_scale_
,
im2col_step_
,
stream
);
break
;
default:
return
1
;
break
;
}
return
0
;
}
nvinfer1
::
DataType
TRTDCNv3
::
getOutputDataType
(
int
index
,
const
nvinfer1
::
DataType
*
inputTypes
,
int
nbInputs
)
const
TRT_NOEXCEPT
{
return
inputTypes
[
0
];
}
// IPluginV2 Methods
const
char
*
TRTDCNv3
::
getPluginType
()
const
TRT_NOEXCEPT
{
return
PLUGIN_NAME
;
}
const
char
*
TRTDCNv3
::
getPluginVersion
()
const
TRT_NOEXCEPT
{
return
PLUGIN_VERSION
;
}
int
TRTDCNv3
::
getNbOutputs
()
const
TRT_NOEXCEPT
{
return
1
;
}
size_t
TRTDCNv3
::
getSerializationSize
()
const
TRT_NOEXCEPT
{
return
serialized_size
(
kernel_h_
)
+
serialized_size
(
kernel_w_
)
+
serialized_size
(
stride_h_
)
+
serialized_size
(
stride_w_
)
+
serialized_size
(
pad_h_
)
+
serialized_size
(
pad_w_
)
+
serialized_size
(
dilation_h_
)
+
serialized_size
(
dilation_w_
)
+
serialized_size
(
group_
)
+
serialized_size
(
group_channels_
)
+
serialized_size
(
offset_scale_
)
+
serialized_size
(
im2col_step_
);
}
void
TRTDCNv3
::
serialize
(
void
*
buffer
)
const
TRT_NOEXCEPT
{
serialize_value
(
&
buffer
,
kernel_h_
);
serialize_value
(
&
buffer
,
kernel_w_
);
serialize_value
(
&
buffer
,
stride_h_
);
serialize_value
(
&
buffer
,
stride_w_
);
serialize_value
(
&
buffer
,
pad_h_
);
serialize_value
(
&
buffer
,
pad_w_
);
serialize_value
(
&
buffer
,
dilation_h_
);
serialize_value
(
&
buffer
,
dilation_w_
);
serialize_value
(
&
buffer
,
group_
);
serialize_value
(
&
buffer
,
group_channels_
);
serialize_value
(
&
buffer
,
offset_scale_
);
serialize_value
(
&
buffer
,
im2col_step_
);
}
////////////////////// creator /////////////////////////////
TRTDCNv3Creator
::
TRTDCNv3Creator
()
{
mPluginAttributes
.
clear
();
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"kernel_h"
));
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"kernel_w"
));
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"stride_h"
));
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"stride_w"
));
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"pad_h"
));
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"pad_w"
));
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"dilation_h"
));
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"dilation_w"
));
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"group"
));
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"group_channels"
));
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"offset_scale"
));
mPluginAttributes
.
emplace_back
(
nvinfer1
::
PluginField
(
"im2col_step"
));
mFC
.
nbFields
=
mPluginAttributes
.
size
();
mFC
.
fields
=
mPluginAttributes
.
data
();
}
const
char
*
TRTDCNv3Creator
::
getPluginName
()
const
TRT_NOEXCEPT
{
return
PLUGIN_NAME
;
}
const
char
*
TRTDCNv3Creator
::
getPluginVersion
()
const
TRT_NOEXCEPT
{
return
PLUGIN_VERSION
;
}
nvinfer1
::
IPluginV2
*
TRTDCNv3Creator
::
createPlugin
(
const
char
*
name
,
const
nvinfer1
::
PluginFieldCollection
*
fc
)
TRT_NOEXCEPT
{
nvinfer1
::
Dims
size
{
2
,
{
1
,
1
}};
int
kernel_h
=
3
;
int
kernel_w
=
3
;
int
stride_h
=
1
;
int
stride_w
=
1
;
int
pad_h
=
1
;
int
pad_w
=
1
;
int
dilation_h
=
1
;
int
dilation_w
=
1
;
int
group
=
28
;
int
group_channels
=
16
;
float
offset_scale
=
1
;
int
im2col_step
=
256
;
for
(
int
i
=
0
;
i
<
fc
->
nbFields
;
i
++
)
{
if
(
fc
->
fields
[
i
].
data
==
nullptr
)
{
continue
;
}
std
::
string
field_name
(
fc
->
fields
[
i
].
name
);
if
(
field_name
.
compare
(
"kernel_h"
)
==
0
)
{
kernel_h
=
static_cast
<
const
int
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
if
(
field_name
.
compare
(
"kernel_w"
)
==
0
)
{
kernel_w
=
static_cast
<
const
int
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
if
(
field_name
.
compare
(
"stride_h"
)
==
0
)
{
stride_h
=
static_cast
<
const
int
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
if
(
field_name
.
compare
(
"stride_w"
)
==
0
)
{
stride_w
=
static_cast
<
const
int
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
if
(
field_name
.
compare
(
"pad_h"
)
==
0
)
{
pad_h
=
static_cast
<
const
int
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
if
(
field_name
.
compare
(
"pad_w"
)
==
0
)
{
pad_w
=
static_cast
<
const
int
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
if
(
field_name
.
compare
(
"dilation_h"
)
==
0
)
{
dilation_h
=
static_cast
<
const
int
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
if
(
field_name
.
compare
(
"dilation_w"
)
==
0
)
{
dilation_w
=
static_cast
<
const
int
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
if
(
field_name
.
compare
(
"group"
)
==
0
)
{
group
=
static_cast
<
const
int
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
if
(
field_name
.
compare
(
"group_channels"
)
==
0
)
{
group_channels
=
static_cast
<
const
int
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
if
(
field_name
.
compare
(
"offset_scale"
)
==
0
)
{
offset_scale
=
static_cast
<
const
float
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
if
(
field_name
.
compare
(
"im2col_step"
)
==
0
)
{
im2col_step
=
static_cast
<
const
int
*>
(
fc
->
fields
[
i
].
data
)[
0
];
}
}
TRTDCNv3
*
plugin
=
new
TRTDCNv3
(
name
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
offset_scale
,
im2col_step
);
plugin
->
setPluginNamespace
(
getPluginNamespace
());
return
plugin
;
}
nvinfer1
::
IPluginV2
*
TRTDCNv3Creator
::
deserializePlugin
(
const
char
*
name
,
const
void
*
serialData
,
size_t
serialLength
)
TRT_NOEXCEPT
{
auto
plugin
=
new
TRTDCNv3
(
name
,
serialData
,
serialLength
);
plugin
->
setPluginNamespace
(
getPluginNamespace
());
return
plugin
;
}
REGISTER_TENSORRT_PLUGIN
(
TRTDCNv3Creator
);
}
// namespace mmdeploy
tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3.hpp
0 → 100644
View file @
76b26f09
#ifndef TRT_DEFORM_CONV_V3_HPP
#define TRT_DEFORM_CONV_V3_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace
mmdeploy
{
class
TRTDCNv3
:
public
TRTPluginBase
{
public:
TRTDCNv3
(
const
std
::
string
&
name
,
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
int
pad_h
,
int
pad_w
,
int
dilation_h
,
int
dilation_w
,
int
group
,
int
group_channels
,
float
offset_scale
,
int
im2col_step
);
TRTDCNv3
(
const
std
::
string
name
,
const
void
*
data
,
size_t
length
);
TRTDCNv3
()
=
delete
;
// IPluginV2DynamicExt Methods
nvinfer1
::
IPluginV2DynamicExt
*
clone
()
const
TRT_NOEXCEPT
override
;
nvinfer1
::
DimsExprs
getOutputDimensions
(
int
outputIndex
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nbInputs
,
nvinfer1
::
IExprBuilder
&
exprBuilder
)
TRT_NOEXCEPT
override
;
bool
supportsFormatCombination
(
int
pos
,
const
nvinfer1
::
PluginTensorDesc
*
ioDesc
,
int
nbInputs
,
int
nbOutputs
)
TRT_NOEXCEPT
override
;
void
configurePlugin
(
const
nvinfer1
::
DynamicPluginTensorDesc
*
in
,
int
nbInputs
,
const
nvinfer1
::
DynamicPluginTensorDesc
*
out
,
int
nbOutputs
)
TRT_NOEXCEPT
override
;
size_t
getWorkspaceSize
(
const
nvinfer1
::
PluginTensorDesc
*
inputs
,
int
nbInputs
,
const
nvinfer1
::
PluginTensorDesc
*
outputs
,
int
nbOutputs
)
const
TRT_NOEXCEPT
override
;
int
enqueue
(
const
nvinfer1
::
PluginTensorDesc
*
inputDesc
,
const
nvinfer1
::
PluginTensorDesc
*
outputDesc
,
const
void
*
const
*
inputs
,
void
*
const
*
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
TRT_NOEXCEPT
override
;
// IPluginV2Ext Methods
nvinfer1
::
DataType
getOutputDataType
(
int
index
,
const
nvinfer1
::
DataType
*
inputTypes
,
int
nbInputs
)
const
TRT_NOEXCEPT
override
;
// IPluginV2 Methods
const
char
*
getPluginType
()
const
TRT_NOEXCEPT
override
;
const
char
*
getPluginVersion
()
const
TRT_NOEXCEPT
override
;
int
getNbOutputs
()
const
TRT_NOEXCEPT
override
;
size_t
getSerializationSize
()
const
TRT_NOEXCEPT
override
;
void
serialize
(
void
*
buffer
)
const
TRT_NOEXCEPT
override
;
private:
int
kernel_h_
;
int
kernel_w_
;
int
stride_h_
;
int
stride_w_
;
int
pad_h_
;
int
pad_w_
;
int
dilation_h_
;
int
dilation_w_
;
int
group_
;
int
group_channels_
;
float
offset_scale_
;
int
im2col_step_
;
};
class
TRTDCNv3Creator
:
public
TRTPluginCreatorBase
{
public:
TRTDCNv3Creator
();
const
char
*
getPluginName
()
const
TRT_NOEXCEPT
override
;
const
char
*
getPluginVersion
()
const
TRT_NOEXCEPT
override
;
nvinfer1
::
IPluginV2
*
createPlugin
(
const
char
*
name
,
const
nvinfer1
::
PluginFieldCollection
*
fc
)
TRT_NOEXCEPT
override
;
nvinfer1
::
IPluginV2
*
deserializePlugin
(
const
char
*
name
,
const
void
*
serialData
,
size_t
serialLength
)
TRT_NOEXCEPT
override
;
};
}
// namespace mmdeploy
#endif // TRT_DEFORM_CONV_V3_HPP
tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu
0 → 100644
View file @
76b26f09
// Modified from
// https://github.com/pytorch/pytorch/blob/6adbe044e39c8e8db158d91e151aa6dead6e9aa4/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu
#include <cuda_fp16.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <vector>
#include "common_cuda_helper.hpp"
#include "trt_deform_conv_v3_kernel.hpp"
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
const
int
CUDA_NUM_THREADS
=
256
;
inline
int
GET_BLOCKS
(
const
int
N
,
const
int
num_threads
)
{
return
(
N
+
num_threads
-
1
)
/
num_threads
;
}
template
<
typename
scalar_t
>
__device__
scalar_t
dcnv3_im2col_bilinear
(
const
scalar_t
*&
bottom_data
,
const
int
&
height
,
const
int
&
width
,
const
int
&
group
,
const
int
&
group_channels
,
const
scalar_t
&
h
,
const
scalar_t
&
w
,
const
int
&
g
,
const
int
&
c
)
{
const
int
h_low
=
floor
(
h
);
const
int
w_low
=
floor
(
w
);
const
int
h_high
=
h_low
+
1
;
const
int
w_high
=
w_low
+
1
;
const
scalar_t
lh
=
h
-
h_low
;
const
scalar_t
lw
=
w
-
w_low
;
const
scalar_t
hh
=
1
-
lh
,
hw
=
1
-
lw
;
const
int
w_stride
=
group
*
group_channels
;
const
int
h_stride
=
width
*
w_stride
;
const
int
h_low_ptr_offset
=
h_low
*
h_stride
;
const
int
h_high_ptr_offset
=
h_low_ptr_offset
+
h_stride
;
const
int
w_low_ptr_offset
=
w_low
*
w_stride
;
const
int
w_high_ptr_offset
=
w_low_ptr_offset
+
w_stride
;
const
int
base_ptr
=
g
*
group_channels
+
c
;
scalar_t
v1
=
0
;
if
(
h_low
>=
0
&&
w_low
>=
0
)
{
const
int
ptr1
=
h_low_ptr_offset
+
w_low_ptr_offset
+
base_ptr
;
v1
=
bottom_data
[
ptr1
];
}
scalar_t
v2
=
0
;
if
(
h_low
>=
0
&&
w_high
<=
width
-
1
)
{
const
int
ptr2
=
h_low_ptr_offset
+
w_high_ptr_offset
+
base_ptr
;
v2
=
bottom_data
[
ptr2
];
}
scalar_t
v3
=
0
;
if
(
h_high
<=
height
-
1
&&
w_low
>=
0
)
{
const
int
ptr3
=
h_high_ptr_offset
+
w_low_ptr_offset
+
base_ptr
;
v3
=
bottom_data
[
ptr3
];
}
scalar_t
v4
=
0
;
if
(
h_high
<=
height
-
1
&&
w_high
<=
width
-
1
)
{
const
int
ptr4
=
h_high_ptr_offset
+
w_high_ptr_offset
+
base_ptr
;
v4
=
bottom_data
[
ptr4
];
}
const
scalar_t
w1
=
hh
*
hw
,
w2
=
hh
*
lw
,
w3
=
lh
*
hw
,
w4
=
lh
*
lw
;
const
scalar_t
val
=
(
w1
*
v1
+
w2
*
v2
+
w3
*
v3
+
w4
*
v4
);
return
val
;
}
template
<
typename
scalar_t
>
__global__
void
dcnv3_im2col_gpu_kernel
(
const
int
num_kernels
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
scalar_t
*
data_col
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
int
height_in
,
const
int
width_in
,
const
int
height_out
,
const
int
width_out
,
const
scalar_t
offset_scale
)
{
CUDA_KERNEL_LOOP
(
index
,
num_kernels
)
{
int
_temp
=
index
;
const
int
c_col
=
_temp
%
group_channels
;
_temp
/=
group_channels
;
const
int
sampling_index
=
_temp
;
const
int
g_col
=
_temp
%
group
;
_temp
/=
group
;
const
int
p0_w
=
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
-
pad_w
+
(
_temp
%
width_out
)
*
stride_w
;
_temp
/=
width_out
;
const
int
p0_h
=
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
-
pad_h
+
(
_temp
%
height_out
)
*
stride_h
;
_temp
/=
height_out
;
const
int
b_col
=
_temp
;
const
int
input_size
=
height_in
*
width_in
;
scalar_t
*
data_col_ptr
=
data_col
+
index
;
const
int
kernel_size
=
kernel_h
*
kernel_w
;
int
data_weight_ptr
=
sampling_index
*
kernel_size
;
int
data_loc_w_ptr
=
data_weight_ptr
<<
1
;
const
int
qid_stride
=
group
*
group_channels
;
scalar_t
col
=
0
;
const
scalar_t
*
data_im_ptr
=
data_im
+
b_col
*
input_size
*
qid_stride
;
// top-left
const
scalar_t
p0_w_
=
p0_w
-
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
*
offset_scale
;
const
scalar_t
p0_h_
=
p0_h
-
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
*
offset_scale
;
for
(
int
i
=
0
;
i
<
kernel_w
;
++
i
)
{
for
(
int
j
=
0
;
j
<
kernel_h
;
++
j
)
{
const
scalar_t
offset_w
=
data_offset
[
data_loc_w_ptr
];
const
scalar_t
offset_h
=
data_offset
[
data_loc_w_ptr
+
1
];
const
scalar_t
loc_w
=
p0_w_
+
(
i
*
dilation_w
+
offset_w
)
*
offset_scale
;
const
scalar_t
loc_h
=
p0_h_
+
(
j
*
dilation_h
+
offset_h
)
*
offset_scale
;
const
scalar_t
weight
=
data_mask
[
data_weight_ptr
];
if
(
loc_h
>
-
1
&&
loc_w
>
-
1
&&
loc_h
<
height_in
&&
loc_w
<
width_in
)
{
col
+=
dcnv3_im2col_bilinear
(
data_im_ptr
,
height_in
,
width_in
,
group
,
group_channels
,
loc_h
,
loc_w
,
g_col
,
c_col
)
*
weight
;
}
data_weight_ptr
+=
1
;
data_loc_w_ptr
+=
2
;
}
}
*
data_col_ptr
=
col
;
}
}
template
<
typename
scalar_t
>
void
dcnv3_im2col_cuda
(
cudaStream_t
stream
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
scalar_t
*
data_col
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
int
batch_n
,
const
int
height_in
,
const
int
width_in
,
const
int
height_out
,
const
int
width_out
,
const
scalar_t
offset_scale
)
{
const
int
num_kernels
=
batch_n
*
height_out
*
width_out
*
group
*
group_channels
;
const
int
num_actual_kernels
=
batch_n
*
height_out
*
width_out
*
group
*
group_channels
;
const
int
num_threads
=
CUDA_NUM_THREADS
;
dcnv3_im2col_gpu_kernel
<
scalar_t
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
data_im
,
data_offset
,
data_mask
,
data_col
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
);
cudaError_t
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"error in dcnv3_im2col_cuda: %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
template
<
typename
scalar_t
>
void
dcnv3_cuda_forward
(
const
scalar_t
*
input
,
const
scalar_t
*
offset
,
const
scalar_t
*
mask
,
scalar_t
*
output
,
int
batch
,
int
channels
,
int
height_in
,
int
width_in
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
float
offset_scale
,
const
int
im2col_step
,
cudaStream_t
stream
)
{
const
int
height_out
=
(
height_in
+
2
*
pad_h
-
(
dilation_h
*
(
kernel_h
-
1
)
+
1
))
/
stride_h
+
1
;
const
int
width_out
=
(
width_in
+
2
*
pad_w
-
(
dilation_w
*
(
kernel_w
-
1
)
+
1
))
/
stride_w
+
1
;
const
int
im2col_step_
=
std
::
min
(
batch
,
im2col_step
);
const
int
batch_n
=
im2col_step_
;
auto
per_input_size
=
height_in
*
width_in
*
group
*
group_channels
;
auto
per_output_size
=
height_out
*
width_out
*
group
*
group_channels
;
auto
per_offset_size
=
height_out
*
width_out
*
group
*
kernel_h
*
kernel_w
*
2
;
auto
per_mask_size
=
height_out
*
width_out
*
group
*
kernel_h
*
kernel_w
;
for
(
int
n
=
0
;
n
<
batch
/
im2col_step_
;
++
n
)
{
// AT_DISPATCH_FLOATING_TYPES(
dcnv3_im2col_cuda
<
scalar_t
>
(
stream
,
input
+
n
*
im2col_step_
*
per_input_size
,
offset
+
n
*
im2col_step_
*
per_offset_size
,
mask
+
n
*
im2col_step_
*
per_mask_size
,
output
+
n
*
im2col_step_
*
per_output_size
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
batch_n
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
);
}
}
template
<
typename
scalar_t
>
void
DeformConvv3ForwardCUDAKernelLauncher
(
const
scalar_t
*
input
,
const
scalar_t
*
offset
,
const
scalar_t
*
mask
,
scalar_t
*
output
,
void
*
workspace
,
int
batch
,
int
channels
,
int
height
,
int
width
,
int
channels_out
,
int
kernel_w
,
int
kernel_h
,
int
stride_w
,
int
stride_h
,
int
pad_w
,
int
pad_h
,
int
dilation_w
,
int
dilation_h
,
int
group
,
int
group_channel
,
float
offset_scale
,
int
im2col_step
,
cudaStream_t
stream
)
{
dcnv3_cuda_forward
(
input
,
offset
,
mask
,
output
,
batch
,
channels
,
height
,
width
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channel
,
offset_scale
,
im2col_step
,
stream
);
}
template
void
DeformConvv3ForwardCUDAKernelLauncher
<
float
>(
const
float
*
input
,
const
float
*
offset
,
const
float
*
mask
,
float
*
output
,
void
*
workspace
,
int
batch
,
int
channels
,
int
height
,
int
width
,
int
channels_out
,
int
kernel_w
,
int
kernel_h
,
int
stride_w
,
int
stride_h
,
int
pad_w
,
int
pad_h
,
int
dilation_w
,
int
dilation_h
,
int
group
,
int
group_channel
,
float
offset_scale
,
int
im2col_step
,
cudaStream_t
stream
);
tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.hpp
0 → 100644
View file @
76b26f09
#ifndef TRT_DEFORM_CONV_V3_KERNEL_HPP
#define TRT_DEFORM_CONV_V3_KERNEL_HPP
#include <cuda_runtime.h>
#include "common_cuda_helper.hpp"
template
<
typename
scalar_t
>
void
DeformConvv3ForwardCUDAKernelLauncher
(
const
scalar_t
*
input
,
const
scalar_t
*
offset
,
const
scalar_t
*
mask
,
scalar_t
*
output
,
void
*
workspace
,
int
batch
,
int
channels
,
int
height
,
int
width
,
int
channels_out
,
int
kernel_w
,
int
kernel_h
,
int
stride_w
,
int
stride_h
,
int
pad_w
,
int
pad_h
,
int
dilation_w
,
int
dilation_h
,
int
group
,
int
group_channel
,
float
offset_scale
,
int
im2col_step
,
cudaStream_t
stream
);
#endif // TRT_DEFORM_CONV_V3_KERNEL_HPP
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment