OpenDAS / MMCV / Commits / 2e5628b4

Unverified commit 2e5628b4, authored Aug 26, 2022 by q.yao, committed by GitHub on Aug 26, 2022
[Refactor]: Remove deployment for dev-2.x (#2225)
* remove deploy for 2.0
* update onnx ut
parent 961373ad
Changes: 94 (showing 20 changed files with 0 additions and 3468 deletions; +0 −3468)
mmcv/ops/csrc/onnxruntime/ort_mmcv_utils.h                           +0 −15
mmcv/ops/csrc/onnxruntime/reduce_ops.h                               +0 −95
mmcv/ops/csrc/onnxruntime/roi_align.h                                +0 −62
mmcv/ops/csrc/onnxruntime/roi_align_rotated.h                        +0 −62
mmcv/ops/csrc/onnxruntime/rotated_feature_align.h                    +0 −50
mmcv/ops/csrc/onnxruntime/soft_nms.h                                 +0 −49
mmcv/ops/csrc/tensorrt/plugins/trt_corner_pool.cpp                   +0 −217
mmcv/ops/csrc/tensorrt/plugins/trt_corner_pool_kernel.cu             +0 −110
mmcv/ops/csrc/tensorrt/plugins/trt_cuda_helper.cu                    +0 −91
mmcv/ops/csrc/tensorrt/plugins/trt_cummaxmin.cpp                     +0 −242
mmcv/ops/csrc/tensorrt/plugins/trt_cummaxmin_kernel.cu               +0 −90
mmcv/ops/csrc/tensorrt/plugins/trt_deform_conv.cpp                   +0 −318
mmcv/ops/csrc/tensorrt/plugins/trt_deform_conv_kernel.cu             +0 −129
mmcv/ops/csrc/tensorrt/plugins/trt_grid_sampler.cpp                  +0 −256
mmcv/ops/csrc/tensorrt/plugins/trt_grid_sampler_kernel.cu            +0 −441
mmcv/ops/csrc/tensorrt/plugins/trt_instance_norm.cpp                 +0 −246
mmcv/ops/csrc/tensorrt/plugins/trt_modulated_deform_conv.cpp         +0 −308
mmcv/ops/csrc/tensorrt/plugins/trt_modulated_deform_conv_kernel.cu   +0 −134
mmcv/ops/csrc/tensorrt/plugins/trt_nms.cpp                           +0 −279
mmcv/ops/csrc/tensorrt/plugins/trt_nms_kernel.cu                     +0 −274
mmcv/ops/csrc/onnxruntime/ort_mmcv_utils.h   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ORT_MMCV_UTILS_H
#define ORT_MMCV_UTILS_H
#include <onnxruntime_cxx_api.h>

#include <vector>

struct OrtTensorDimensions : std::vector<int64_t> {
  OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue *value) {
    OrtTensorTypeAndShapeInfo *info = ort.GetTensorTypeAndShape(value);
    std::vector<int64_t>::operator=(ort.GetTensorShape(info));
    ort.ReleaseTensorTypeAndShapeInfo(info);
  }
};
#endif  // ORT_MMCV_UTILS_H
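The deleted ONNX Runtime kernels below all consume this helper the same way: construct an OrtTensorDimensions from an input OrtValue to obtain its shape as a std::vector<int64_t>. A minimal usage sketch follows; the kernel itself is hypothetical, only the Ort:: calls are real ONNX Runtime C++ API.

// Hypothetical elementwise Compute() body illustrating OrtTensorDimensions.
void ExampleCompute(Ort::CustomOpApi ort, OrtKernelContext *context) {
  const OrtValue *input = ort.KernelContext_GetInput(context, 0);
  OrtTensorDimensions dims(ort, input);  // input shape as int64_t vector
  // Allocate an output with the same shape and copy the data across.
  OrtValue *output =
      ort.KernelContext_GetOutput(context, 0, dims.data(), dims.size());
  const float *src = ort.GetTensorData<float>(input);
  float *dst = ort.GetTensorMutableData<float>(output);
  size_t count = 1;
  for (auto d : dims) count *= d;
  for (size_t i = 0; i < count; ++i) dst[i] = src[i];
}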
mmcv/ops/csrc/onnxruntime/reduce_ops.h   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_REDUCE_OPS_H
#define ONNXRUNTIME_REDUCE_OPS_H
#include <onnxruntime_cxx_api.h>

struct MMCVCumMaxKernel {
 public:
  MMCVCumMaxKernel(Ort::CustomOpApi ort, const OrtKernelInfo *info)
      : ort_(ort) {
    dim_ = ort_.KernelInfoGetAttribute<int64_t>(info, "dim");

    // create allocator
    allocator_ = Ort::AllocatorWithDefaultOptions();
  }

  void Compute(OrtKernelContext *context);

 private:
  Ort::CustomOpApi ort_;
  Ort::AllocatorWithDefaultOptions allocator_;

  int64_t dim_;
};

struct MMCVCumMinKernel {
 public:
  MMCVCumMinKernel(Ort::CustomOpApi ort, const OrtKernelInfo *info)
      : ort_(ort) {
    dim_ = ort_.KernelInfoGetAttribute<int64_t>(info, "dim");

    // create allocator
    allocator_ = Ort::AllocatorWithDefaultOptions();
  }

  void Compute(OrtKernelContext *context);

 private:
  Ort::CustomOpApi ort_;
  Ort::AllocatorWithDefaultOptions allocator_;

  int64_t dim_;
};

struct MMCVCumMaxCustomOp
    : Ort::CustomOpBase<MMCVCumMaxCustomOp, MMCVCumMaxKernel> {
  void *CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo *info) const {
    return new MMCVCumMaxKernel(api, info);
  }

  const char *GetName() const { return "cummax"; }

  size_t GetInputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetInputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  size_t GetOutputTypeCount() const { return 2; }
  ONNXTensorElementDataType GetOutputType(size_t index) const {
    if (index == 1) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  };
};

struct MMCVCumMinCustomOp
    : Ort::CustomOpBase<MMCVCumMinCustomOp, MMCVCumMinKernel> {
  void *CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo *info) const {
    return new MMCVCumMinKernel(api, info);
  }

  const char *GetName() const { return "cummin"; }

  size_t GetInputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetInputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  size_t GetOutputTypeCount() const { return 2; }
  ONNXTensorElementDataType GetOutputType(size_t index) const {
    if (index == 1) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  };
};

#endif  // ONNXRUNTIME_REDUCE_OPS_H
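For context, custom-op definitions like these only take effect once they are added to a custom-op domain on the session options. A registration sketch follows; the domain string "mmcv" and the wrapper function are assumptions for illustration, while Ort::CustomOpDomain::Add and Ort::SessionOptions::Add are standard ONNX Runtime C++ API.

#include <onnxruntime_cxx_api.h>

// Sketch: attach the deleted cummax/cummin ops to a session via a domain.
Ort::SessionOptions MakeMMCVSessionOptions() {
  static MMCVCumMaxCustomOp cummax_op;  // ops must outlive the session
  static MMCVCumMinCustomOp cummin_op;
  static Ort::CustomOpDomain domain("mmcv");  // domain name is an assumption
  static bool initialized = false;
  if (!initialized) {
    domain.Add(&cummax_op);
    domain.Add(&cummin_op);
    initialized = true;
  }
  Ort::SessionOptions options;
  options.Add(domain);
  return options;
}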
mmcv/ops/csrc/onnxruntime/roi_align.h   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_H
#define ONNXRUNTIME_ROI_ALIGN_H

#include <assert.h>
#include <onnxruntime_cxx_api.h>

#include <cmath>
#include <mutex>
#include <string>
#include <vector>

struct MMCVRoiAlignKernel {
 public:
  MMCVRoiAlignKernel(Ort::CustomOpApi ort, const OrtKernelInfo *info)
      : ort_(ort) {
    aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
    aligned_height_ =
        ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
    aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
    pool_mode_ = ort_.KernelInfoGetAttribute<std::string>(info, "mode");
    sampling_ratio_ =
        ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
    spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
  }

  void Compute(OrtKernelContext *context);

 private:
  Ort::CustomOpApi ort_;

  int aligned_height_;
  int aligned_width_;
  float spatial_scale_;
  int sampling_ratio_;
  std::string pool_mode_;
  int aligned_;
};

struct MMCVRoiAlignCustomOp
    : Ort::CustomOpBase<MMCVRoiAlignCustomOp, MMCVRoiAlignKernel> {
  void *CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo *info) const {
    return new MMCVRoiAlignKernel(api, info);
  }
  const char *GetName() const { return "MMCVRoiAlign"; }

  size_t GetInputTypeCount() const { return 2; }
  ONNXTensorElementDataType GetInputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  size_t GetOutputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetOutputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  }
};
#endif  // ONNXRUNTIME_ROI_ALIGN_H
mmcv/ops/csrc/onnxruntime/roi_align_rotated.h   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#define ONNXRUNTIME_ROI_ALIGN_ROTATED_H

#include <assert.h>
#include <onnxruntime_cxx_api.h>

#include <cmath>
#include <mutex>
#include <string>
#include <vector>

struct MMCVRoIAlignRotatedKernel {
 public:
  MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo *info)
      : ort_(ort) {
    aligned_height_ =
        ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
    aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
    sampling_ratio_ =
        ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
    spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
    aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
    clockwise_ = ort_.KernelInfoGetAttribute<int64_t>(info, "clockwise");
  }

  void Compute(OrtKernelContext *context);

 private:
  Ort::CustomOpApi ort_;

  int aligned_height_;
  int aligned_width_;
  float spatial_scale_;
  int sampling_ratio_;
  int aligned_;
  int clockwise_;
};

struct MMCVRoIAlignRotatedCustomOp
    : Ort::CustomOpBase<MMCVRoIAlignRotatedCustomOp,
                        MMCVRoIAlignRotatedKernel> {
  void *CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo *info) const {
    return new MMCVRoIAlignRotatedKernel(api, info);
  }
  const char *GetName() const { return "MMCVRoIAlignRotated"; }

  size_t GetInputTypeCount() const { return 2; }
  ONNXTensorElementDataType GetInputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  size_t GetOutputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetOutputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  }
};
#endif  // ONNXRUNTIME_ROI_ALIGN_ROTATED_H
mmcv/ops/csrc/onnxruntime/rotated_feature_align.h   deleted 100644 → 0
#ifndef ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
#define ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H

#include <onnxruntime_cxx_api.h>

#include <cmath>

struct MMCVRotatedFeatureAlignKernel {
 public:
  MMCVRotatedFeatureAlignKernel(Ort::CustomOpApi ort,
                                const OrtKernelInfo *info)
      : ort_(ort) {
    spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
    points_ = ort_.KernelInfoGetAttribute<int64_t>(info, "points");
  }

  void Compute(OrtKernelContext *context);

 private:
  Ort::CustomOpApi ort_;

  float spatial_scale_;
  int points_;
};

struct MMCVRotatedFeatureAlignCustomOp
    : Ort::CustomOpBase<MMCVRotatedFeatureAlignCustomOp,
                        MMCVRotatedFeatureAlignKernel> {
  void *CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo *info) const {
    return new MMCVRotatedFeatureAlignKernel(api, info);
  }
  const char *GetName() const { return "MMCVRotatedFeatureAlign"; }

  size_t GetInputTypeCount() const { return 2; }
  ONNXTensorElementDataType GetInputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  size_t GetOutputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetOutputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  }
};
#endif  // ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
mmcv/ops/csrc/onnxruntime/soft_nms.h   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_SOFT_NMS_H
#define ONNXRUNTIME_SOFT_NMS_H
#include <onnxruntime_cxx_api.h>

struct SoftNmsKernel {
  SoftNmsKernel(OrtApi api, const OrtKernelInfo *info);

  void Compute(OrtKernelContext *context);

 protected:
  OrtApi api_;
  Ort::CustomOpApi ort_;
  const OrtKernelInfo *info_;
  Ort::AllocatorWithDefaultOptions allocator_;

  float iou_threshold_;
  float sigma_;
  float min_score_;
  int64_t method_;
  int64_t offset_;
};

struct SoftNmsOp : Ort::CustomOpBase<SoftNmsOp, SoftNmsKernel> {
  void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
    return new SoftNmsKernel(api, info);
  };

  const char *GetName() const { return "SoftNonMaxSuppression"; };

  size_t GetInputTypeCount() const { return 2; };
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  size_t GetOutputTypeCount() const { return 2; };
  ONNXTensorElementDataType GetOutputType(size_t index) const {
    if (index == 1) {
      return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
    }
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };

  // force cpu
  const char *GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  };
};
#endif  // ONNXRUNTIME_SOFT_NMS_H
mmcv/ops/csrc/tensorrt/plugins/trt_corner_pool.cpp   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_corner_pool.hpp"

#include <assert.h>

#include "trt_serialize.hpp"

void CornerPoolForwardLauncher_float(const float *input, float *output,
                                     const int batch_size, const int channels,
                                     const int height, const int width,
                                     const int pool_type, cudaStream_t stream);

namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *CORNER_POOL_PLUGIN_NAME{"MMCVCornerPool"};
}  // namespace

CornerPoolPluginDynamic::CornerPoolPluginDynamic(const std::string &name,
                                                 TRT_CORNER_POOL_TYPE poolType)
    : mLayerName(name), mPoolType(poolType) {}

CornerPoolPluginDynamic::CornerPoolPluginDynamic(const std::string name,
                                                 const void *data,
                                                 size_t length)
    : mLayerName(name) {
  deserialize_value(&data, &length, &mPoolType);
}

CornerPoolPluginDynamic::~CornerPoolPluginDynamic() {}

nvinfer1::IPluginV2DynamicExt *CornerPoolPluginDynamic::clone() const {
  CornerPoolPluginDynamic *plugin =
      new CornerPoolPluginDynamic(mLayerName, mPoolType);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::DimsExprs CornerPoolPluginDynamic::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) {
  return inputs[0];
}

bool CornerPoolPluginDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
    int nbOutputs) {
  switch (pos) {
    // input[0]
    case 0:
      return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
             inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
    // output[0]
    case 1:
      return inOut[pos].type == inOut[0].type &&
             inOut[pos].format == inOut[0].format;
    default:
      return false;
  }
}

void CornerPoolPluginDynamic::configurePlugin(
    const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}

size_t CornerPoolPluginDynamic::getWorkspaceSize(
    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
  int sizeof_dtype = mmcv::getElementSize(outputs[0].type);
}

int CornerPoolPluginDynamic::enqueue(
    const nvinfer1::PluginTensorDesc *inputDesc,
    const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
    void *const *outputs, void *workSpace, cudaStream_t stream) {
  const void *input = inputs[0];
  void *output_value = outputs[0];

  const int batch_size = inputDesc[0].dims.d[0];
  const int channels = inputDesc[0].dims.d[1];
  const int height = inputDesc[0].dims.d[2];
  const int width = inputDesc[0].dims.d[3];

  CornerPoolForwardLauncher_float((float *)input, (float *)output_value,
                                  batch_size, channels, height, width,
                                  int(mPoolType), stream);

  return 0;
}

nvinfer1::DataType CornerPoolPluginDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
  return inputTypes[0];
}

// IPluginV2 Methods
const char *CornerPoolPluginDynamic::getPluginType() const {
  switch (mPoolType) {
    case TRT_CORNER_POOL_TYPE::TRT_TOP_POOL:
    case TRT_CORNER_POOL_TYPE::TRT_BOTTOM_POOL:
    case TRT_CORNER_POOL_TYPE::TRT_LEFT_POOL:
    case TRT_CORNER_POOL_TYPE::TRT_RIGHT_POOL:
      return CORNER_POOL_PLUGIN_NAME;
    default:
      return "UnknownpoolType";
  }
}

const char *CornerPoolPluginDynamic::getPluginVersion() const {
  return PLUGIN_VERSION;
}

int CornerPoolPluginDynamic::getNbOutputs() const { return 1; }

int CornerPoolPluginDynamic::initialize() { return 0; }

void CornerPoolPluginDynamic::terminate() {}

size_t CornerPoolPluginDynamic::getSerializationSize() const {
  return sizeof(mPoolType);
}

void CornerPoolPluginDynamic::serialize(void *buffer) const {
  serialize_value(&buffer, mPoolType);
}

void CornerPoolPluginDynamic::destroy() {
  // This gets called when the network containing plugin is destroyed
  delete this;
}

void CornerPoolPluginDynamic::setPluginNamespace(const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *CornerPoolPluginDynamic::getPluginNamespace() const {
  return mNamespace.c_str();
}

CornerPoolPluginDynamicCreator::CornerPoolPluginDynamicCreator() {
  mPluginAttributes.clear();
  mPluginAttributes.emplace_back(nvinfer1::PluginField("mode"));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *CornerPoolPluginDynamicCreator::getPluginName() const {
  return CORNER_POOL_PLUGIN_NAME;
}

const char *CornerPoolPluginDynamicCreator::getPluginVersion() const {
  return PLUGIN_VERSION;
}

const nvinfer1::PluginFieldCollection *
CornerPoolPluginDynamicCreator::getFieldNames() {
  return &mFC;
}

nvinfer1::IPluginV2 *CornerPoolPluginDynamicCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) {
  TRT_CORNER_POOL_TYPE poolType;
  int poolMode = -1;

  for (int i = 0; i < fc->nbFields; i++) {
    if (fc->fields[i].data == nullptr) {
      continue;
    }
    std::string field_name(fc->fields[i].name);

    if (field_name.compare("mode") == 0) {
      poolMode = static_cast<const int *>(fc->fields[i].data)[0];
    }
  }

  assert(poolMode >= 0 && poolMode <= 3);
  switch (poolMode) {
    case 0:
      poolType = TRT_CORNER_POOL_TYPE::TRT_TOP_POOL;
      break;
    case 1:
      poolType = TRT_CORNER_POOL_TYPE::TRT_BOTTOM_POOL;
      break;
    case 2:
      poolType = TRT_CORNER_POOL_TYPE::TRT_LEFT_POOL;
      break;
    case 3:
      poolType = TRT_CORNER_POOL_TYPE::TRT_RIGHT_POOL;
      break;
    default:
      break;
  }

  CornerPoolPluginDynamic *plugin =
      new CornerPoolPluginDynamic(name, poolType);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::IPluginV2 *CornerPoolPluginDynamicCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) {
  // This object will be deleted when the network is destroyed, which will
  // call FCPluginDynamic::destroy()
  auto plugin = new CornerPoolPluginDynamic(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

void CornerPoolPluginDynamicCreator::setPluginNamespace(
    const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *CornerPoolPluginDynamicCreator::getPluginNamespace() const {
  return mNamespace.c_str();
}
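The creator classes in these plugin files are registered with TensorRT elsewhere in the library, outside this diff. One standard mechanism, sketched here, is the REGISTER_TENSORRT_PLUGIN macro from the TensorRT runtime headers; the actual registration point in mmcv may differ.

#include <NvInferRuntime.h>

// Static registration at library load time (a sketch; mmcv's real
// registration site is not part of this commit).
REGISTER_TENSORRT_PLUGIN(CornerPoolPluginDynamicCreator);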
mmcv/ops/csrc/tensorrt/plugins/trt_corner_pool_kernel.cu   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"

template <typename scalar_t>
__global__ void top_bottom_pool_kernel(const scalar_t *input, scalar_t *output,
                                       const int batch_size,
                                       const int channels, const int height,
                                       const int width, const int pool_type) {
  const int nthreads = batch_size * channels * width;
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int n_idx = index / (channels * width);  // batch
    int w_idx = index % width;               // width
    int c_idx = (index / width) % channels;  // channels
    int offset_n = n_idx * channels * width * height;
    int offset_n_c = offset_n + c_idx * width * height;
    int direction = -1;            // in [-1, 1], default for TopPool
    int index_start = height - 2;  // default for TopPool
    // pool_type in [0, 1]
    if (pool_type == 0) {
      // TopPool
      // directly copy the most bottom value from input to output
      output[offset_n_c + (height - 1) * width + w_idx] =
          input[offset_n_c + (height - 1) * width + w_idx];
    } else {
      // BottomPool
      // directly copy the most top value from input to output
      output[offset_n_c + w_idx] = input[offset_n_c + w_idx];
      index_start = 1;
      direction = 1;
    }
    // do pool
    for (int h = index_start; h >= 0 && h < height; h += direction) {
      output[offset_n_c + h * width + w_idx] =
          max(output[offset_n_c + (h - direction) * width + w_idx],
              input[offset_n_c + h * width + w_idx]);
    }
  }
}

template <typename scalar_t>
__global__ void left_right_pool_kernel(const scalar_t *input, scalar_t *output,
                                       const int batch_size,
                                       const int channels, const int height,
                                       const int width, const int pool_type) {
  const int nthreads = batch_size * channels * height;
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int n_idx = index / (channels * height);  // batch
    int h_idx = index % height;               // height
    int c_idx = (index / height) % channels;  // channels
    int offset_n = n_idx * channels * width * height;
    int offset_n_c = offset_n + c_idx * width * height;
    int offset_n_c_h = offset_n_c + h_idx * width;
    int direction = -1;           // in [-1, 1], default for LeftPool
    int index_start = width - 2;  // default for LeftPool
    // pool_type in [2, 3]
    if (pool_type == 2) {
      // LeftPool
      // directly copy the most right value from input to output
      output[offset_n_c_h + width - 1] = input[offset_n_c_h + width - 1];
    } else {
      // RightPool
      // directly copy the most left value from input to output
      output[offset_n_c_h] = input[offset_n_c_h];
      index_start = 1;
      direction = 1;
    }
    // do pool
    for (int w = index_start; w >= 0 && w < width; w += direction) {
      output[offset_n_c_h + w] =
          max(output[offset_n_c_h + w - direction], input[offset_n_c_h + w]);
    }
  }
}

template <typename scalar_t>
void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output,
                               const int batch_size, const int channels,
                               const int height, const int width,
                               const int pool_type, cudaStream_t stream) {
  int nthreads = -1, col_block = -1;

  switch (pool_type) {
    case 0:
    case 1:
      nthreads = batch_size * channels * width;
      col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
      top_bottom_pool_kernel<scalar_t>
          <<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
              input, output, batch_size, channels, height, width, pool_type);
      break;
    case 2:
    case 3:
      nthreads = batch_size * channels * height;
      col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
      left_right_pool_kernel<scalar_t>
          <<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
              input, output, batch_size, channels, height, width, pool_type);
      break;
  }
}

void CornerPoolForwardLauncher_float(const float *input, float *output,
                                     const int batch_size, const int channels,
                                     const int height, const int width,
                                     const int pool_type,
                                     cudaStream_t stream) {
  CornerPoolForwardLauncher<float>(input, output, batch_size, channels, height,
                                   width, pool_type, stream);
}
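CUDA_1D_KERNEL_LOOP, GET_BLOCKS, and THREADS_PER_BLOCK come from common_cuda_helper.hpp, which this commit does not touch. They typically follow the Caffe-style grid-stride pattern; the sketch below shows plausible definitions, where the block size and any block-count cap are assumptions rather than the real header's values.

#define THREADS_PER_BLOCK 512  // value assumed; see common_cuda_helper.hpp

// Grid-stride loop: each thread covers indices i, i + gridDim*blockDim, ...
#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

// Ceil-divide the element count into enough blocks to cover all work.
inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
  return (N + num_threads - 1) / num_threads;
}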
mmcv/ops/csrc/tensorrt/plugins/trt_cuda_helper.cu   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include <cublas_v2.h>

#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"

using mmcv::TensorDesc;

template <class scalar_t>
__global__ void copy_permute_kernel(scalar_t *dst, const scalar_t *src, int n,
                                    TensorDesc ts_src_stride,
                                    TensorDesc ts_dst_stride,
                                    TensorDesc ts_permute) {
  const int src_dim = ts_src_stride.dim;
  int *src_stride = &(ts_src_stride.stride[0]);
  int *dst_stride = &(ts_dst_stride.stride[0]);
  int *permute = &(ts_permute.shape[0]);
  CUDA_1D_KERNEL_LOOP(index, n) {
    size_t dst_index = index;
    size_t src_index = 0;
    for (int i = 0; i < src_dim; ++i) {
      int dim_index = dst_index / dst_stride[i];
      dst_index = dst_index % dst_stride[i];
      src_index += dim_index * src_stride[permute[i]];
    }
    dst[index] = src[src_index];
  }
}

template <class scalar_t>
void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size,
                   int *permute, int src_dim, cudaStream_t stream) {
  size_t copy_size = 1;
  TensorDesc ts_permute;
  memcpy(&(ts_permute.shape[0]), permute, src_dim * sizeof(int));

  TensorDesc ts_src_stride;
  TensorDesc ts_dst_stride;
  ts_src_stride.dim = src_dim;
  ts_dst_stride.dim = src_dim;
  int *src_stride = &(ts_src_stride.stride[0]);
  int *dst_stride = &(ts_dst_stride.stride[0]);
  int *dst_size = &(ts_dst_stride.shape[0]);
  src_stride[src_dim - 1] = 1;
  dst_stride[src_dim - 1] = 1;

  for (int i = src_dim - 1; i >= 0; --i) {
    dst_size[i] = src_size[permute[i]];
    if (i < src_dim - 1) {
      src_stride[i] = src_stride[i + 1] * src_size[i + 1];
    }
  }

  for (int i = src_dim - 1; i >= 0; --i) {
    copy_size *= dst_size[i];
    if (i < src_dim - 1) {
      dst_stride[i] = dst_stride[i + 1] * dst_size[i + 1];
    }
  }

  copy_permute_kernel<scalar_t>
      <<<GET_BLOCKS(copy_size), THREADS_PER_BLOCK, 0, stream>>>(
          dst, src, copy_size, ts_src_stride, ts_dst_stride, ts_permute);
}

template void memcpyPermute<float>(float *dst, const float *src, int *src_size,
                                   int *permute, int src_dim,
                                   cudaStream_t stream);

template <>
cublasStatus_t cublasGemmWrap<float>(cublasHandle_t handle,
                                     cublasOperation_t transa,
                                     cublasOperation_t transb, int m, int n,
                                     int k, const float *alpha, const float *A,
                                     int lda, const float *B, int ldb,
                                     const float *beta, float *C, int ldc) {
  return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
                     beta, C, ldc);
}

template <>
cublasStatus_t cublasGemmWrap<half>(cublasHandle_t handle,
                                    cublasOperation_t transa,
                                    cublasOperation_t transb, int m, int n,
                                    int k, const half *alpha, const half *A,
                                    int lda, const half *B, int ldb,
                                    const half *beta, half *C, int ldc) {
  return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
                     beta, C, ldc);
}
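The two specializations above implement a primary template declared in trt_cuda_helper.cuh, which is outside this diff. Its declaration can be inferred directly from the specializations:

// Inferred declaration of the GEMM wrapper (lives in trt_cuda_helper.cuh).
template <typename scalar_t>
cublasStatus_t cublasGemmWrap(cublasHandle_t handle, cublasOperation_t transa,
                              cublasOperation_t transb, int m, int n, int k,
                              const scalar_t *alpha, const scalar_t *A,
                              int lda, const scalar_t *B, int ldb,
                              const scalar_t *beta, scalar_t *C, int ldc);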
mmcv/ops/csrc/tensorrt/plugins/trt_cummaxmin.cpp   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_cummaxmin.hpp"

#include <assert.h>

#include "trt_serialize.hpp"

void CumMaxMinForwardLauncher_float(const float *input, float *output_value,
                                    int *output_index, const int *dims,
                                    int nbDims, int cum_dim, int cum_type,
                                    cudaStream_t stream);

void CumMaxMinForwardLauncher_int32(const int *input, int *output_value,
                                    int *output_index, const int *dims,
                                    int nbDims, int cum_dim, int cum_type,
                                    cudaStream_t stream);

namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *CUMMAXMIN_PLUGIN_NAME{"cummaxmin"};
static const char *CUMMAX_PLUGIN_NAME{"cummax"};
static const char *CUMMIN_PLUGIN_NAME{"cummin"};
}  // namespace

CumMaxMinPluginDynamic::CumMaxMinPluginDynamic(const std::string &name,
                                               int dim, TRT_CUMCMPTYPE cumType)
    : mLayerName(name), mDim(dim), mCumType(cumType) {}

CumMaxMinPluginDynamic::CumMaxMinPluginDynamic(const std::string name,
                                               const void *data, size_t length)
    : mLayerName(name) {
  deserialize_value(&data, &length, &mDim);
  deserialize_value(&data, &length, &mCumType);
}

CumMaxMinPluginDynamic::~CumMaxMinPluginDynamic() {}

nvinfer1::IPluginV2DynamicExt *CumMaxMinPluginDynamic::clone() const {
  CumMaxMinPluginDynamic *plugin =
      new CumMaxMinPluginDynamic(mLayerName, mDim, mCumType);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::DimsExprs CumMaxMinPluginDynamic::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) {
  return inputs[0];
}

bool CumMaxMinPluginDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
    int nbOutputs) {
  switch (pos) {
    // input[0]
    case 0:
      return (inOut[pos].type == nvinfer1::DataType::kFLOAT ||
              inOut[pos].type == nvinfer1::DataType::kINT32) &&
             inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
    // output[0]
    case 1:
      return inOut[pos].type == inOut[0].type &&
             inOut[pos].format == inOut[0].format;
    // output[1]
    case 2:
      return inOut[pos].type == nvinfer1::DataType::kINT32 &&
             inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
    default:
      return false;
  }
}

void CumMaxMinPluginDynamic::configurePlugin(
    const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}

size_t CumMaxMinPluginDynamic::getWorkspaceSize(
    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
  int sizeof_dtype = mmcv::getElementSize(outputs[0].type);
}

int CumMaxMinPluginDynamic::enqueue(
    const nvinfer1::PluginTensorDesc *inputDesc,
    const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
    void *const *outputs, void *workSpace, cudaStream_t stream) {
  const void *input = inputs[0];
  void *output_value = outputs[0];
  int *output_index = (int *)outputs[1];

  const int *dims = &(inputDesc[0].dims.d[0]);
  int nbDims = inputDesc[0].dims.nbDims;

  switch (inputDesc[0].type) {
    case nvinfer1::DataType::kFLOAT:
      CumMaxMinForwardLauncher_float((float *)input, (float *)output_value,
                                     output_index, dims, nbDims, mDim,
                                     int(mCumType), stream);
      break;
    case nvinfer1::DataType::kINT32:
      CumMaxMinForwardLauncher_int32((int *)input, (int *)output_value,
                                     output_index, dims, nbDims, mDim,
                                     int(mCumType), stream);
      break;
    default:
      break;
  }

  return 0;
}

nvinfer1::DataType CumMaxMinPluginDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
  switch (index) {
    case 0:
      return inputTypes[0];
    case 1:
      return nvinfer1::DataType::kINT32;
    default:
      break;
  }
}

// IPluginV2 Methods
const char *CumMaxMinPluginDynamic::getPluginType() const {
  switch (mCumType) {
    case TRT_CUMCMPTYPE::TRT_CUMMAX:
      return CUMMAX_PLUGIN_NAME;
    case TRT_CUMCMPTYPE::TRT_CUMMIN:
      return CUMMIN_PLUGIN_NAME;
    default:
      return "UnknownCumType";
  }
}

const char *CumMaxMinPluginDynamic::getPluginVersion() const {
  return PLUGIN_VERSION;
}

int CumMaxMinPluginDynamic::getNbOutputs() const { return 2; }

int CumMaxMinPluginDynamic::initialize() { return 0; }

void CumMaxMinPluginDynamic::terminate() {}

size_t CumMaxMinPluginDynamic::getSerializationSize() const {
  return sizeof(mDim) + sizeof(mCumType);
}

void CumMaxMinPluginDynamic::serialize(void *buffer) const {
  serialize_value(&buffer, mDim);
  serialize_value(&buffer, mCumType);
}

void CumMaxMinPluginDynamic::destroy() {
  // This gets called when the network containing plugin is destroyed
  delete this;
}

void CumMaxMinPluginDynamic::setPluginNamespace(const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *CumMaxMinPluginDynamic::getPluginNamespace() const {
  return mNamespace.c_str();
}

CumMaxMinPluginDynamicCreator::CumMaxMinPluginDynamicCreator(
    TRT_CUMCMPTYPE cumType)
    : mCumType(cumType) {
  mPluginAttributes.clear();
  mPluginAttributes.emplace_back(nvinfer1::PluginField("dim"));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *CumMaxMinPluginDynamicCreator::getPluginName() const {
  return CUMMAXMIN_PLUGIN_NAME;
}

const char *CumMaxMinPluginDynamicCreator::getPluginVersion() const {
  return PLUGIN_VERSION;
}

const nvinfer1::PluginFieldCollection *
CumMaxMinPluginDynamicCreator::getFieldNames() {
  return &mFC;
}

nvinfer1::IPluginV2 *CumMaxMinPluginDynamicCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) {
  int dim = 0;

  for (int i = 0; i < fc->nbFields; i++) {
    if (fc->fields[i].data == nullptr) {
      continue;
    }
    std::string field_name(fc->fields[i].name);

    if (field_name.compare("dim") == 0) {
      dim = static_cast<const int *>(fc->fields[i].data)[0];
    }
  }

  CumMaxMinPluginDynamic *plugin =
      new CumMaxMinPluginDynamic(name, dim, mCumType);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::IPluginV2 *CumMaxMinPluginDynamicCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) {
  // This object will be deleted when the network is destroyed, which will
  // call FCPluginDynamic::destroy()
  auto plugin = new CumMaxMinPluginDynamic(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

void CumMaxMinPluginDynamicCreator::setPluginNamespace(
    const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *CumMaxMinPluginDynamicCreator::getPluginNamespace() const {
  return mNamespace.c_str();
}

CumMaxPluginDynamicCreator::CumMaxPluginDynamicCreator()
    : CumMaxMinPluginDynamicCreator(TRT_CUMCMPTYPE::TRT_CUMMAX) {}

const char *CumMaxPluginDynamicCreator::getPluginName() const {
  return CUMMAX_PLUGIN_NAME;
}

CumMinPluginDynamicCreator::CumMinPluginDynamicCreator()
    : CumMaxMinPluginDynamicCreator(TRT_CUMCMPTYPE::TRT_CUMMIN) {}

const char *CumMinPluginDynamicCreator::getPluginName() const {
  return CUMMIN_PLUGIN_NAME;
}
mmcv/ops/csrc/tensorrt/plugins/trt_cummaxmin_kernel.cu   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"

using mmcv::TensorDesc;

template <typename scalar_t>
__global__ void cummaxmin_kernel(const scalar_t *input, scalar_t *output_value,
                                 int *output_index, TensorDesc tensor_desc,
                                 int cum_dim, int cum_type) {
  const size_t cum_size = tensor_desc.shape[cum_dim];
  const size_t cum_stride = tensor_desc.stride[cum_dim];
  const size_t data_size =
      tensor_desc.stride[0] * tensor_desc.shape[0] / cum_size;
  CUDA_1D_KERNEL_LOOP(index, data_size) {
    size_t cum_offset =
        index / cum_stride * (cum_size * cum_stride) + index % cum_stride;
    int cum_index = 0;
    auto cum_value = input[cum_offset];
    output_value[cum_offset] = cum_value;
    output_index[cum_offset] = cum_index;

    for (size_t cum_index_current = 1; cum_index_current < cum_size;
         ++cum_index_current) {
      cum_offset += cum_stride;
      const auto cum_value_current = input[cum_offset];
      switch (cum_type) {
        case 0:
          // max
          if (cum_value_current > cum_value) {
            cum_value = cum_value_current;
            cum_index = cum_index_current;
          }
          break;
        case 1:
          // min
          if (cum_value_current < cum_value) {
            cum_value = cum_value_current;
            cum_index = cum_index_current;
          }
          break;
      }
      output_value[cum_offset] = cum_value;
      output_index[cum_offset] = cum_index;
    }
  }
}

template <typename scalar_t>
void CumMaxMinForwardLauncher(const scalar_t *input, scalar_t *output_value,
                              int *output_index, const int *dims, int nbDims,
                              int cum_dim, int cum_type, cudaStream_t stream) {
  // fill tensordesc and initial
  TensorDesc tensor_desc;
  memset((void *)&tensor_desc, 0, sizeof(TensorDesc));
  tensor_desc.dim = nbDims;
  tensor_desc.shape[nbDims - 1] = dims[nbDims - 1];
  tensor_desc.stride[nbDims - 1] = 1;
  for (int i = nbDims - 2; i >= 0; --i) {
    tensor_desc.shape[i] = dims[i];
    tensor_desc.stride[i] = dims[i + 1] * tensor_desc.stride[i + 1];
  }

  // cum dim should be larger than 0
  cum_dim = cum_dim >= 0 ? cum_dim : (nbDims + cum_dim);

  const int data_size =
      tensor_desc.stride[0] * tensor_desc.shape[0] / tensor_desc.shape[cum_dim];

  const int col_block = GET_BLOCKS(data_size, THREADS_PER_BLOCK);

  cummaxmin_kernel<scalar_t><<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
      input, output_value, output_index, tensor_desc, cum_dim, cum_type);
}

void CumMaxMinForwardLauncher_float(const float *input, float *output_value,
                                    int *output_index, const int *dims,
                                    int nbDims, int cum_dim, int cum_type,
                                    cudaStream_t stream) {
  CumMaxMinForwardLauncher<float>(input, output_value, output_index, dims,
                                  nbDims, cum_dim, cum_type, stream);
}

void CumMaxMinForwardLauncher_int32(const int *input, int *output_value,
                                    int *output_index, const int *dims,
                                    int nbDims, int cum_dim, int cum_type,
                                    cudaStream_t stream) {
  CumMaxMinForwardLauncher<int>(input, output_value, output_index, dims,
                                nbDims, cum_dim, cum_type, stream);
}
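mmcv::TensorDesc is declared in trt_plugin_helper.hpp, which this commit does not touch. From its use here and in trt_cuda_helper.cu it is a plain shape/stride/rank record; a sketch follows, in which the array capacity is an assumption.

namespace mmcv {
constexpr int kMaxTensorDims = 10;  // capacity assumed, not from this diff
struct TensorDesc {
  int shape[kMaxTensorDims];   // extent of each dimension
  int stride[kMaxTensorDims];  // elements skipped per step in each dimension
  int dim;                     // number of valid dimensions
};
}  // namespace mmcv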
mmcv/ops/csrc/tensorrt/plugins/trt_deform_conv.cpp   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_deform_conv.hpp"

#include <assert.h>

#include <chrono>

#include "trt_serialize.hpp"

void DeformConvForwardCUDAKernelLauncher_float(
    const float *input, const float *weight, const float *offset,
    float *output, void *workspace, int batchSize, int nInputPlane,
    int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW,
    int dH, int padW, int padH, int dilationW, int dilationH, int group,
    int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
    cudaStream_t stream);

namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVDeformConv2d"};
}  // namespace

nvinfer1::PluginFieldCollection DeformableConvPluginDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField>
    DeformableConvPluginDynamicCreator::mPluginAttributes;

DeformableConvPluginDynamic::DeformableConvPluginDynamic(
    const std::string &name, const nvinfer1::Dims &stride,
    const nvinfer1::Dims &padding, const nvinfer1::Dims &dilation,
    const int deformableGroup, const int group, int im2colStep)
    : mLayerName(name),
      mStride(stride),
      mPadding(padding),
      mDilation(dilation),
      mDeformableGroup(deformableGroup),
      mGroup(group),
      mIm2colStep(im2colStep) {}

DeformableConvPluginDynamic::DeformableConvPluginDynamic(
    const std::string name, const void *data, size_t length)
    : mLayerName(name) {
  deserialize_value(&data, &length, &mStride);
  deserialize_value(&data, &length, &mPadding);
  deserialize_value(&data, &length, &mDilation);
  deserialize_value(&data, &length, &mDeformableGroup);
  deserialize_value(&data, &length, &mGroup);
  deserialize_value(&data, &length, &mIm2colStep);
}

DeformableConvPluginDynamic::~DeformableConvPluginDynamic() {}

nvinfer1::IPluginV2DynamicExt *DeformableConvPluginDynamic::clone() const {
  DeformableConvPluginDynamic *plugin = new DeformableConvPluginDynamic(
      mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup,
      mIm2colStep);
  plugin->setPluginNamespace(getPluginNamespace());

  return plugin;
}

nvinfer1::DimsExprs DeformableConvPluginDynamic::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) {
  nvinfer1::DimsExprs ret;
  ret.nbDims = 4;
  ret.d[0] = inputs[0].d[0];
  ret.d[1] = inputs[2].d[0];

  ret.d[2] = inputs[1].d[2];
  ret.d[3] = inputs[1].d[3];

  return ret;
}

bool DeformableConvPluginDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
    int nbOutputs) {
  if (pos == 0) {
    return (inOut[pos].type == nvinfer1::DataType::kFLOAT &&
            inOut[pos].format == nvinfer1::TensorFormat::kLINEAR);
  } else {
    return inOut[pos].type == inOut[0].type &&
           inOut[pos].format == inOut[0].format;
  }
}

void DeformableConvPluginDynamic::configurePlugin(
    const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}

size_t DeformableConvPluginDynamic::getWorkspaceSize(
    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
  int sizeof_dtype = mmcv::getElementSize(outputs[0].type);

  int batch_size = inputs[0].dims.d[0];
  int nInputPlane = inputs[0].dims.d[1];
  int inputHeight = inputs[0].dims.d[2];
  int inputWidth = inputs[0].dims.d[3];

  int nOutputPlane = outputs[0].dims.d[1];
  int outputHeight = outputs[0].dims.d[2];
  int outputWidth = outputs[0].dims.d[3];

  int kW = inputs[2].dims.d[2];
  int kH = inputs[2].dims.d[3];
  int im2col_step = std::min(batch_size, mIm2colStep);

  size_t col_size = mmcv::getAlignedSize(nInputPlane * kW * kH * im2col_step *
                                         outputHeight * outputWidth *
                                         sizeof_dtype);

  size_t out_size = 0;
  if (im2col_step != 1)
    out_size = mmcv::getAlignedSize(batch_size * nOutputPlane * outputHeight *
                                    outputWidth * sizeof_dtype);

  return col_size + out_size;
}

int DeformableConvPluginDynamic::enqueue(
    const nvinfer1::PluginTensorDesc *inputDesc,
    const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
    void *const *outputs, void *workSpace, cudaStream_t stream) {
  int batch_size = inputDesc[0].dims.d[0];
  int inputChannel = inputDesc[0].dims.d[1];
  int inputHeight = inputDesc[0].dims.d[2];
  int inputWidth = inputDesc[0].dims.d[3];
  int outputChannel = outputDesc[0].dims.d[1];
  int kernelHeight = inputDesc[2].dims.d[2];
  int kernelWidth = inputDesc[2].dims.d[3];

  const void *x = inputs[0];
  const void *offset = inputs[1];
  const void *weight = inputs[2];
  void *output = outputs[0];
  int im2col_step = std::min(batch_size, mIm2colStep);

  // TODO: add fp16 support
  auto data_type = inputDesc[0].type;
  switch (data_type) {
    case nvinfer1::DataType::kFLOAT:
      DeformConvForwardCUDAKernelLauncher_float(
          (float *)x, (float *)weight, (float *)offset, (float *)output,
          workSpace, batch_size, inputChannel, inputHeight, inputWidth,
          outputChannel, kernelWidth, kernelHeight, mStride.d[0], mStride.d[1],
          mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], mGroup,
          mDeformableGroup, im2col_step, m_cublas_handle, stream);
      break;
    default:
      return 1;
      break;
  }

  return 0;
}

nvinfer1::DataType DeformableConvPluginDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
  return inputTypes[0];
}

// IPluginV2 Methods
const char *DeformableConvPluginDynamic::getPluginType() const {
  return PLUGIN_NAME;
}

const char *DeformableConvPluginDynamic::getPluginVersion() const {
  return PLUGIN_VERSION;
}

int DeformableConvPluginDynamic::getNbOutputs() const { return 1; }

int DeformableConvPluginDynamic::initialize() { return 0; }

void DeformableConvPluginDynamic::terminate() {}

size_t DeformableConvPluginDynamic::getSerializationSize() const {
  return sizeof(mStride) + sizeof(mPadding) + sizeof(mDilation) +
         sizeof(mDeformableGroup) + sizeof(mGroup) + sizeof(mIm2colStep);
}

void DeformableConvPluginDynamic::serialize(void *buffer) const {
  serialize_value(&buffer, mStride);
  serialize_value(&buffer, mPadding);
  serialize_value(&buffer, mDilation);
  serialize_value(&buffer, mDeformableGroup);
  serialize_value(&buffer, mGroup);
  serialize_value(&buffer, mIm2colStep);
}

void DeformableConvPluginDynamic::destroy() {
  // This gets called when the network containing plugin is destroyed
  delete this;
}

void DeformableConvPluginDynamic::attachToContext(
    cudnnContext *cudnnContext, cublasContext *cublasContext,
    nvinfer1::IGpuAllocator *gpuAllocator) {
  m_cublas_handle = cublasContext;
}

void DeformableConvPluginDynamic::detachFromContext() {}

void DeformableConvPluginDynamic::setPluginNamespace(
    const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *DeformableConvPluginDynamic::getPluginNamespace() const {
  return mNamespace.c_str();
}

////////////////////// creator /////////////////////////////

DeformableConvPluginDynamicCreator::DeformableConvPluginDynamicCreator() {
  mPluginAttributes.emplace_back(nvinfer1::PluginField("stride"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("padding"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("groups"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("bias"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("im2col_step"));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *DeformableConvPluginDynamicCreator::getPluginName() const {
  return PLUGIN_NAME;
}

const char *DeformableConvPluginDynamicCreator::getPluginVersion() const {
  return PLUGIN_VERSION;
}

const nvinfer1::PluginFieldCollection *
DeformableConvPluginDynamicCreator::getFieldNames() {
  return &mFC;
}

nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) {
  nvinfer1::Dims stride{2, {1, 1}};
  nvinfer1::Dims padding{2, {0, 0}};
  nvinfer1::Dims dilation{2, {1, 1}};
  int deformableGroup = 1;
  int group = 1;
  int im2col_step = 32;

  for (int i = 0; i < fc->nbFields; i++) {
    if (fc->fields[i].data == nullptr) {
      continue;
    }
    std::string field_name(fc->fields[i].name);

    if (field_name.compare("stride") == 0) {
      stride.nbDims = 2;
      stride.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      if (fc->fields[i].length == 1) {
        stride.d[1] = stride.d[0];
      } else {
        stride.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
      }
    }

    if (field_name.compare("padding") == 0) {
      padding.nbDims = 2;
      padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      if (fc->fields[i].length == 1) {
        padding.d[1] = padding.d[0];
      } else {
        padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
      }
    }

    if (field_name.compare("dilation") == 0) {
      dilation.nbDims = 2;
      dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      if (fc->fields[i].length == 1) {
        dilation.d[1] = dilation.d[0];
      } else {
        dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
      }
    }

    if (field_name.compare("deform_groups") == 0) {
      deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("group") == 0) {
      group = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("im2col_step") == 0) {
      im2col_step = static_cast<const int *>(fc->fields[i].data)[0];
    }
  }

  DeformableConvPluginDynamic *plugin = new DeformableConvPluginDynamic(
      name, stride, padding, dilation, deformableGroup, group, im2col_step);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) {
  auto plugin =
      new DeformableConvPluginDynamic(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

void DeformableConvPluginDynamicCreator::setPluginNamespace(
    const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *DeformableConvPluginDynamicCreator::getPluginNamespace() const {
  return mNamespace.c_str();
}
mmcv/ops/csrc/tensorrt/plugins/trt_deform_conv_kernel.cu   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include <cuda_fp16.h>

#include "common_cuda_helper.hpp"
#include "deform_conv_cuda_kernel.cuh"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"

template <typename T>
void trt_deformable_im2col(const T *data_input, const T *data_offset,
                           const int channels, const int height,
                           const int width, const int ksize_h,
                           const int ksize_w, const int pad_h, const int pad_w,
                           const int stride_h, const int stride_w,
                           const int dilation_h, const int dilation_w,
                           const int parallel_imgs, const int deformable_group,
                           T *data_col, cudaStream_t stream) {
  int height_col =
      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
  int width_col =
      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
  int num_kernels = channels * height_col * width_col * parallel_imgs;
  int channel_per_deformable_group = channels / deformable_group;

  deformable_im2col_gpu_kernel<T>
      <<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
          num_kernels, data_input, data_offset, height, width, ksize_h,
          ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
          channel_per_deformable_group, parallel_imgs, channels,
          deformable_group, height_col, width_col, data_col);

  cudaCheckError();
}

template <typename scalar_t>
void DeformConvForwardCUDAKernelLauncher(
    const scalar_t *input, const scalar_t *weight, const scalar_t *offset,
    scalar_t *output, void *workspace, int batchSize, int nInputPlane,
    int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW,
    int dH, int padW, int padH, int dilationW, int dilationH, int group,
    int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
    cudaStream_t stream) {
  size_t word_size = sizeof(scalar_t);

  im2col_step = std::min(int(batchSize), im2col_step);
  long outputWidth =
      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight =
      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  long long columns_size = mmcv::getAlignedSize(nInputPlane * kW * kH *
                                                im2col_step * outputHeight *
                                                outputWidth * word_size);

  // column buffer for img2col
  scalar_t *columns = (scalar_t *)workspace;
  workspace = workspace + columns_size;

  scalar_t *output_buffer;
  long long output_buffer_size = 0;
  if (im2col_step == 1) {
    output_buffer = output;
  } else {
    // output need permute when im2col_step!=1
    output_buffer = (scalar_t *)workspace;
    output_buffer_size = batchSize * nOutputPlane * outputWidth * outputHeight;
  }

  long long input_elt_step =
      im2col_step * nInputPlane * inputHeight * inputWidth;
  long long offset_elt_step =
      im2col_step * deformable_group * 2 * kH * kW * outputHeight * outputWidth;
  long long out_buffer_step =
      nOutputPlane * im2col_step * outputHeight * outputWidth;
  long long col_g_step =
      nInputPlane * kW * kH / group * im2col_step * outputHeight * outputWidth;
  long long weight_g_step =
      nOutputPlane / group * nInputPlane / group * kH * kW;
  long long out_buffer_g_step =
      nOutputPlane / group * im2col_step * outputHeight * outputWidth;
  int m = nOutputPlane / group;
  int n = im2col_step * outputHeight * outputWidth;
  int k = nInputPlane / group * kH * kW;
  scalar_t alpha = 1.;
  scalar_t beta = 0.;

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    const scalar_t *input_start = input + elt * input_elt_step;
    const scalar_t *offset_start = offset + elt * offset_elt_step;

    trt_deformable_im2col<scalar_t>(input_start, offset_start, nInputPlane,
                                    inputHeight, inputWidth, kH, kW, padH,
                                    padW, dH, dW, dilationH, dilationW,
                                    im2col_step, deformable_group, columns,
                                    stream);

    for (int g = 0; g < group; ++g) {
      const scalar_t *weight_start = weight + g * weight_g_step;
      scalar_t *col_start = columns + g * col_g_step;
      scalar_t *out_buffer_start =
          output_buffer + elt * out_buffer_step + g * out_buffer_g_step;

      cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m,
                               k, &alpha, col_start, n, weight_start, k, &beta,
                               out_buffer_start, n);
      cudaCheckError();
    }
  }

  if (im2col_step != 1) {
    int output_buffer_shape[5] = {batchSize / im2col_step, nOutputPlane,
                                  im2col_step, outputHeight, outputWidth};
    int output_buffer_permute[5] = {0, 2, 1, 3, 4};
    memcpyPermute<scalar_t>(output, output_buffer, &output_buffer_shape[0],
                            &output_buffer_permute[0], 5, stream);
  }
}

void DeformConvForwardCUDAKernelLauncher_float(
    const float *input, const float *weight, const float *offset,
    float *output, void *workspace, int batchSize, int nInputPlane,
    int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, int dW,
    int dH, int padW, int padH, int dilationW, int dilationH, int group,
    int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
    cudaStream_t stream) {
  DeformConvForwardCUDAKernelLauncher<float>(
      input, weight, offset, output, workspace, batchSize, nInputPlane,
      inputHeight, inputWidth, nOutputPlane, kW, kH, dW, dH, padW, padH,
      dilationW, dilationH, group, deformable_group, im2col_step,
      cublas_handle, stream);
}
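When im2col_step != 1, each GEMM above writes an (nOutputPlane, im2col_step, H, W) chunk, so the full buffer ends up ordered as (batch/step, channels, step, H, W); the final memcpyPermute with permutation {0, 2, 1, 3, 4} reorders it to (batch/step, step, channels, H, W), i.e. contiguous NCHW. A host-side reference of that permute logic, mirroring copy_permute_kernel (a hypothetical standalone helper for illustration only):

#include <cstddef>
#include <vector>

// CPU reference of copy_permute_kernel: dst is a permuted gather of src,
// with dst_shape[i] == src_shape[permute[i]]. dst must be pre-sized.
template <class T>
void cpu_permute(std::vector<T> &dst, const std::vector<T> &src,
                 const std::vector<int> &src_shape,
                 const std::vector<int> &permute) {
  const int dim = static_cast<int>(src_shape.size());
  std::vector<int> src_stride(dim, 1), dst_stride(dim, 1), dst_shape(dim);
  for (int i = 0; i < dim; ++i) dst_shape[i] = src_shape[permute[i]];
  for (int i = dim - 2; i >= 0; --i) {
    src_stride[i] = src_stride[i + 1] * src_shape[i + 1];
    dst_stride[i] = dst_stride[i + 1] * dst_shape[i + 1];
  }
  for (size_t index = 0; index < dst.size(); ++index) {
    size_t dst_index = index, src_index = 0;
    for (int i = 0; i < dim; ++i) {
      const int dim_index = static_cast<int>(dst_index / dst_stride[i]);
      dst_index %= dst_stride[i];
      src_index += static_cast<size_t>(dim_index) * src_stride[permute[i]];
    }
    dst[index] = src[src_index];
  }
}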
mmcv/ops/csrc/tensorrt/plugins/trt_grid_sampler.cpp   deleted 100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_grid_sampler.hpp"
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include "trt_serialize.hpp"
using mmcv::GridSamplerInterpolation;
using mmcv::GridSamplerPadding;

void grid_sample_float(float *output, const float *input, const float *grid,
                       int *output_dims, int *input_dims, int *grid_dims,
                       int nb_dims, GridSamplerInterpolation interp,
                       GridSamplerPadding padding, bool align_corners,
                       cudaStream_t stream);

namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"grid_sampler"};
}  // namespace

nvinfer1::PluginFieldCollection GridSamplerDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField> GridSamplerDynamicCreator::mPluginAttributes;

GridSamplerDynamic::GridSamplerDynamic(const std::string &name, int mode,
                                       int paddingMode, bool alignCorners)
    : mLayerName(name),
      mMode(mode),
      mPaddingMode(paddingMode),
      mAlignCorners(alignCorners) {}

GridSamplerDynamic::GridSamplerDynamic(const std::string name,
                                       const void *data, size_t length)
    : mLayerName(name) {
  deserialize_value(&data, &length, &mMode);
  deserialize_value(&data, &length, &mPaddingMode);
  deserialize_value(&data, &length, &mAlignCorners);
}

nvinfer1::IPluginV2DynamicExt *GridSamplerDynamic::clone() const {
  GridSamplerDynamic *plugin =
      new GridSamplerDynamic(mLayerName, mMode, mPaddingMode, mAlignCorners);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::DimsExprs GridSamplerDynamic::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) {
  nvinfer1::DimsExprs ret;
  ret.nbDims = inputs[0].nbDims;
  ret.d[0] = inputs[0].d[0];
  ret.d[1] = inputs[0].d[1];
  for (int i = 2; i < ret.nbDims; ++i) {
    ret.d[i] = inputs[1].d[i - 1];
  }
  return ret;
}

bool GridSamplerDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
    int nbOutputs) {
  if (pos == 0) {
    return (inOut[pos].type == nvinfer1::DataType::kFLOAT &&
            inOut[pos].format == nvinfer1::TensorFormat::kLINEAR);
  } else {
    return inOut[pos].type == inOut[0].type &&
           inOut[pos].format == inOut[0].format;
  }
}

void GridSamplerDynamic::configurePlugin(
    const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {
  // Validate input arguments
}

size_t GridSamplerDynamic::getWorkspaceSize(
    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
  return 0;
}

int GridSamplerDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
                                const nvinfer1::PluginTensorDesc *outputDesc,
                                const void *const *inputs,
                                void *const *outputs, void *workSpace,
                                cudaStream_t stream) {
  nvinfer1::Dims input_dims = inputDesc[0].dims;
  nvinfer1::Dims grid_dims = inputDesc[1].dims;
  nvinfer1::Dims output_dims = outputDesc[0].dims;

  using mmcv::GridSamplerInterpolation;
  using mmcv::GridSamplerPadding;

  GridSamplerInterpolation interp_mode = GridSamplerInterpolation::Bilinear;
  switch (mMode) {
    case 0:
      interp_mode = GridSamplerInterpolation::Bilinear;
      break;
    case 1:
      interp_mode = GridSamplerInterpolation::Nearest;
      break;
    default:
      break;
  }

  GridSamplerPadding padding_mode = GridSamplerPadding::Zeros;
  switch (mPaddingMode) {
    case 0:
      padding_mode = GridSamplerPadding::Zeros;
      break;
    case 1:
      padding_mode = GridSamplerPadding::Border;
      break;
    case 2:
      padding_mode = GridSamplerPadding::Reflection;
      break;
    default:
      break;
  }

  auto data_type = inputDesc[0].type;
  switch (data_type) {
    case nvinfer1::DataType::kFLOAT:
      grid_sample_float(
          (float *)outputs[0], (float *)inputs[0], (float *)inputs[1],
          &(output_dims.d[0]), &(input_dims.d[0]), &(grid_dims.d[0]),
          input_dims.nbDims, interp_mode, padding_mode, mAlignCorners, stream);
      break;
    default:
      return 1;
      break;
  }

  return 0;
}
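// Note: the integer encodings mapped above follow PyTorch's grid_sample
// convention (interpolation: 0 = bilinear, 1 = nearest; padding: 0 = zeros,
// 1 = border, 2 = reflection), so attribute values exported from torch can be
// passed through unchanged.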
nvinfer1::DataType GridSamplerDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
  return inputTypes[0];
}

// IPluginV2 Methods
const char *GridSamplerDynamic::getPluginType() const { return PLUGIN_NAME; }

const char *GridSamplerDynamic::getPluginVersion() const {
  return PLUGIN_VERSION;
}

int GridSamplerDynamic::getNbOutputs() const { return 1; }

int GridSamplerDynamic::initialize() { return 0; }

void GridSamplerDynamic::terminate() {}

size_t GridSamplerDynamic::getSerializationSize() const {
  return sizeof(mMode) + sizeof(mPaddingMode) + sizeof(mAlignCorners);
}

void GridSamplerDynamic::serialize(void *buffer) const {
  serialize_value(&buffer, mMode);
  serialize_value(&buffer, mPaddingMode);
  serialize_value(&buffer, mAlignCorners);
}

void GridSamplerDynamic::destroy() {
  // This gets called when the network containing plugin is destroyed
  delete this;
}

void GridSamplerDynamic::setPluginNamespace(const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *GridSamplerDynamic::getPluginNamespace() const {
  return mNamespace.c_str();
}

////////////////////// creator /////////////////////////////
GridSamplerDynamicCreator::GridSamplerDynamicCreator() {
  mPluginAttributes.clear();
  mPluginAttributes.emplace_back(nvinfer1::PluginField("interpolation_mode"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("padding_mode"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("align_corners"));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *GridSamplerDynamicCreator::getPluginName() const {
  return PLUGIN_NAME;
}

const char *GridSamplerDynamicCreator::getPluginVersion() const {
  return PLUGIN_VERSION;
}

const nvinfer1::PluginFieldCollection *
GridSamplerDynamicCreator::getFieldNames() {
  return &mFC;
}

nvinfer1::IPluginV2 *GridSamplerDynamicCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) {
  int mode = 0;
  int paddingMode = 0;
  bool alignCorners = false;
  for (int i = 0; i < fc->nbFields; i++) {
    if (fc->fields[i].data == nullptr) {
      continue;
    }
    std::string field_name(fc->fields[i].name);

    if (field_name.compare("interpolation_mode") == 0) {
      mode = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("padding_mode") == 0) {
      paddingMode = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("align_corners") == 0) {
      alignCorners = (bool)(static_cast<const int *>(fc->fields[i].data)[0]);
    }
  }

  GridSamplerDynamic *plugin =
      new GridSamplerDynamic(name, mode, paddingMode, alignCorners);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::IPluginV2 *GridSamplerDynamicCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) {
  // This object will be deleted when the network is destroyed, which will
  // call FCPluginDynamic::destroy()
  auto plugin = new GridSamplerDynamic(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

void GridSamplerDynamicCreator::setPluginNamespace(const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *GridSamplerDynamicCreator::getPluginNamespace() const {
  return mNamespace.c_str();
}
mmcv/ops/csrc/tensorrt/plugins/trt_grid_sampler_kernel.cu
deleted
100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cuh
// and
// https://github.com/pytorch/pytorch/blob/ec683299ebabf297a3504c76248d37be830e4342/aten/src/ATen/native/cuda/GridSampler.cu
#include <cuda_fp16.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <vector>
#include "common_cuda_helper.hpp"
#include "trt_cuda_helper.cuh"
#include "trt_grid_sampler.hpp"
#include "trt_plugin_helper.hpp"
using mmcv::GridSamplerInterpolation;
using mmcv::GridSamplerPadding;
using mmcv::TensorDesc;
// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value,
// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5).
// if align_corners: -1 and +1 get sent to the centers of the corner pixels
// -1 --> 0
// +1 --> (size - 1)
// scale_factor = (size - 1) / 2
// if not align_corners: -1 and +1 get sent to the image edges
// -1 --> -0.5
// +1 --> (size - 1) + 0.5 == size - 0.5
// scale_factor = size / 2
template <typename scalar_t>
static __forceinline__ __device__ scalar_t grid_sampler_unnormalize(
    scalar_t coord, int size, bool align_corners) {
  if (align_corners) {
    // unnormalize coord from [-1, 1] to [0, size - 1]
    return ((coord + 1.f) / 2) * (size - 1);
  } else {
    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
    return ((coord + 1.f) * size - 1) / 2;
  }
}
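// Illustrative check with size = 4:
//   align_corners = true : -1 -> 0.0,  0 -> 1.5, +1 -> 3.0
//   align_corners = false: -1 -> -0.5, 0 -> 1.5, +1 -> 3.5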
// Clips coordinates to between 0 and clip_limit - 1
template <typename scalar_t>
static __forceinline__ __device__ scalar_t clip_coordinates(scalar_t in,
                                                            int clip_limit) {
  return ::min(static_cast<scalar_t>(clip_limit - 1),
               ::max(in, static_cast<scalar_t>(0)));
}

// Reflects coordinates until they fall between low and high (inclusive).
// The bounds are passed as twice their value so that half-integer values
// can be represented as ints.
template <typename scalar_t>
static __forceinline__ __device__ scalar_t reflect_coordinates(
    scalar_t in, int twice_low, int twice_high) {
  if (twice_low == twice_high) {
    return static_cast<scalar_t>(0);
  }
  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
  in = ::fabs(in - min);
  // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
  scalar_t extra = ::fmod(in, span);
  int flips = static_cast<int>(::floor(in / span));
  if (flips % 2 == 0) {
    return extra + min;
  } else {
    return span - extra + min;
  }
}
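// Illustrative check: reflect_coordinates(4.3, /*twice_low=*/0,
// /*twice_high=*/6) gives min = 0, span = 3, extra = 1.3, flips = 1 (odd),
// so the coordinate bounces off the high bound and lands at 3 - 1.3 = 1.7.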
template <typename scalar_t>
static __forceinline__ __device__ scalar_t safe_downgrade_to_int_range(
    scalar_t x) {
  // -100.0 does not have special meaning. This is just to make sure
  // it's not within_bounds_2d or within_bounds_3d, and does not cause
  // undefined behavior. See #35506.
  if (x > INT_MAX - 1 || x < INT_MIN || !::isfinite(static_cast<double>(x)))
    return static_cast<scalar_t>(-100.0);
  return x;
}

// Computes the pixel source index value for a grid coordinate
template <typename scalar_t>
static __forceinline__ __device__ scalar_t grid_sampler_compute_source_index(
    scalar_t coord, int size, GridSamplerPadding padding_mode,
    bool align_corners) {
  coord = grid_sampler_unnormalize(coord, size, align_corners);
  if (padding_mode == GridSamplerPadding::Border) {
    // clip coordinates to image borders
    coord = clip_coordinates(coord, size);
  } else if (padding_mode == GridSamplerPadding::Reflection) {
    // reflect coordinates by image borders
    if (align_corners) {
      coord = reflect_coordinates(coord, 0, 2 * (size - 1));
    } else {
      coord = reflect_coordinates(coord, -1, 2 * size - 1);
    }
    // clip coordinates to image borders
    coord = clip_coordinates(coord, size);
  }
  coord = safe_downgrade_to_int_range(coord);
  return coord;
}

static __forceinline__ __device__ bool within_bounds_2d(int h, int w, int H,
                                                        int W) {
  return h >= 0 && h < H && w >= 0 && w < W;
}

static __forceinline__ __device__ bool within_bounds_3d(int d, int h, int w,
                                                        int D, int H, int W) {
  return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
}
template <typename scalar_t>
__global__ void grid_sampler_2d_kernel(
    const int nthreads, const scalar_t *input, const scalar_t *grid,
    scalar_t *output, TensorDesc input_desc, TensorDesc grid_desc,
    TensorDesc output_desc, const GridSamplerInterpolation interpolation_mode,
    const GridSamplerPadding padding_mode, bool align_corners) {
  int C = input_desc.shape[1];
  int inp_H = input_desc.shape[2];
  int inp_W = input_desc.shape[3];
  int out_H = grid_desc.shape[1];
  int out_W = grid_desc.shape[2];
  int inp_sN = input_desc.stride[0];
  int inp_sC = input_desc.stride[1];
  int inp_sH = input_desc.stride[2];
  int inp_sW = input_desc.stride[3];
  int grid_sN = grid_desc.stride[0];
  int grid_sH = grid_desc.stride[1];
  int grid_sW = grid_desc.stride[2];
  int grid_sCoor = grid_desc.stride[3];
  int out_sN = output_desc.stride[0];
  int out_sC = output_desc.stride[1];
  int out_sH = output_desc.stride[2];
  int out_sW = output_desc.stride[3];
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int w = index % out_W;
    const int h = (index / out_W) % out_H;
    const int n = index / (out_H * out_W);
    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
    // get the corresponding input x, y coordinates from grid
    scalar_t ix = grid[grid_offset];
    scalar_t iy = grid[grid_offset + grid_sCoor];
    ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode,
                                           align_corners);
    iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode,
                                           align_corners);
    if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
      // get NE, NW, SE, SW pixel values from (x, y)
      int ix_nw = static_cast<int>(::floor(ix));
      int iy_nw = static_cast<int>(::floor(iy));
      int ix_ne = ix_nw + 1;
      int iy_ne = iy_nw;
      int ix_sw = ix_nw;
      int iy_sw = iy_nw + 1;
      int ix_se = ix_nw + 1;
      int iy_se = iy_nw + 1;
      // get surfaces to each neighbor:
      scalar_t nw = (ix_se - ix) * (iy_se - iy);
      scalar_t ne = (ix - ix_sw) * (iy_sw - iy);
      scalar_t sw = (ix_ne - ix) * (iy - iy_ne);
      scalar_t se = (ix - ix_nw) * (iy - iy_nw);
      // calculate bilinear weighted pixel value and set output pixel
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
      for (int c = 0; c < C;
           ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
        *out_ptr_NCHW = static_cast<scalar_t>(0);
        if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
          *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
        }
        if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
          *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
        }
        if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
          *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
        }
        if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
          *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
        }
      }
    } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
      int ix_nearest = static_cast<int>(::round(ix));
      int iy_nearest = static_cast<int>(::round(iy));
      // assign nearest neighbor pixel value to output pixel
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
      for (int c = 0; c < C;
           ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
        if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
          *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
        } else {
          *out_ptr_NCHW = static_cast<scalar_t>(0);
        }
      }
    }
  }
}
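// Note: the kernel is launched with one thread per output spatial location
// (nthreads = N * out_H * out_W; the channel dimension is excluded), and each
// thread walks all C channels via the inp_sC / out_sC strides.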
template <typename scalar_t>
__global__ void grid_sampler_3d_kernel(
    const int nthreads, const scalar_t *input, const scalar_t *grid,
    scalar_t *output, TensorDesc input_desc, TensorDesc grid_desc,
    TensorDesc output_desc, const GridSamplerInterpolation interpolation_mode,
    const GridSamplerPadding padding_mode, bool align_corners) {
  int C = input_desc.shape[1];
  int inp_D = input_desc.shape[2];
  int inp_H = input_desc.shape[3];
  int inp_W = input_desc.shape[4];
  int out_D = grid_desc.shape[1];
  int out_H = grid_desc.shape[2];
  int out_W = grid_desc.shape[3];
  int inp_sN = input_desc.stride[0];
  int inp_sC = input_desc.stride[1];
  int inp_sD = input_desc.stride[2];
  int inp_sH = input_desc.stride[3];
  int inp_sW = input_desc.stride[4];
  int grid_sN = grid_desc.stride[0];
  int grid_sD = grid_desc.stride[1];
  int grid_sH = grid_desc.stride[2];
  int grid_sW = grid_desc.stride[3];
  int grid_sCoor = grid_desc.stride[4];
  int out_sN = output_desc.stride[0];
  int out_sC = output_desc.stride[1];
  int out_sD = output_desc.stride[2];
  int out_sH = output_desc.stride[3];
  int out_sW = output_desc.stride[4];
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int w = index % out_W;
    const int h = (index / out_W) % out_H;
    const int d = (index / (out_H * out_W)) % out_D;
    const int n = index / (out_D * out_H * out_W);
    const int grid_offset =
        n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
    // get the corresponding input x, y, z coordinates from grid
    scalar_t ix = grid[grid_offset];
    scalar_t iy = grid[grid_offset + grid_sCoor];
    scalar_t iz = grid[grid_offset + 2 * grid_sCoor];
    ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode,
                                           align_corners);
    iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode,
                                           align_corners);
    iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode,
                                           align_corners);
    if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
      // get corner pixel values from (x, y, z)
      // for 4d, we used north-east-south-west
      // for 5d, we add top-bottom
      int ix_tnw = static_cast<int>(::floor(ix));
      int iy_tnw = static_cast<int>(::floor(iy));
      int iz_tnw = static_cast<int>(::floor(iz));
      int ix_tne = ix_tnw + 1;
      int iy_tne = iy_tnw;
      int iz_tne = iz_tnw;
      int ix_tsw = ix_tnw;
      int iy_tsw = iy_tnw + 1;
      int iz_tsw = iz_tnw;
      int ix_tse = ix_tnw + 1;
      int iy_tse = iy_tnw + 1;
      int iz_tse = iz_tnw;
      int ix_bnw = ix_tnw;
      int iy_bnw = iy_tnw;
      int iz_bnw = iz_tnw + 1;
      int ix_bne = ix_tnw + 1;
      int iy_bne = iy_tnw;
      int iz_bne = iz_tnw + 1;
      int ix_bsw = ix_tnw;
      int iy_bsw = iy_tnw + 1;
      int iz_bsw = iz_tnw + 1;
      int ix_bse = ix_tnw + 1;
      int iy_bse = iy_tnw + 1;
      int iz_bse = iz_tnw + 1;
      // get surfaces to each neighbor:
      scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
      scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
      scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
      scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
      scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
      scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
      scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
      scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCDHW =
          output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
      for (int c = 0; c < C;
           ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
        // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) *
        // tne
        // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) *
        // tse
        // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) *
        // bne
        // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) *
        // bse
        *out_ptr_NCDHW = static_cast<scalar_t>(0);
        if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] *
              tnw;
        }
        if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] *
              tne;
        }
        if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] *
              tsw;
        }
        if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] *
              tse;
        }
        if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] *
              bnw;
        }
        if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] *
              bne;
        }
        if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] *
              bsw;
        }
        if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] *
              bse;
        }
      }
    } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
      int ix_nearest = static_cast<int>(::round(ix));
      int iy_nearest = static_cast<int>(::round(iy));
      int iz_nearest = static_cast<int>(::round(iz));
      // assign nearest neighbor pixel value to output pixel
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCDHW =
          output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
      for (int c = 0; c < C;
           ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
        if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H,
                             inp_W)) {
          *out_ptr_NCDHW =
              inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH +
                         ix_nearest * inp_sW];
        } else {
          *out_ptr_NCDHW = static_cast<scalar_t>(0);
        }
      }
    }
  }
}
void create_desc(const int *dims, int nb_dims, TensorDesc &desc) {
  memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims);
  desc.stride[nb_dims - 1] = 1;
  for (int i = nb_dims - 2; i >= 0; --i) {
    desc.stride[i] = desc.stride[i + 1] * desc.shape[i + 1];
  }
}
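// Illustrative check: dims = {2, 3, 4, 5} (NCHW) yields contiguous strides
// {60, 20, 5, 1}, i.e. stride[i] is the product of shape[i+1..].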
template <typename T>
void grid_sample(T *output, const T *input, const T *grid, int *output_dims,
                 int *input_dims, int *grid_dims, int nb_dims,
                 GridSamplerInterpolation interp, GridSamplerPadding padding,
                 bool align_corners, cudaStream_t stream) {
  TensorDesc input_desc;
  create_desc(input_dims, nb_dims, input_desc);

  TensorDesc output_desc;
  create_desc(output_dims, nb_dims, output_desc);

  TensorDesc grid_desc;
  create_desc(grid_dims, nb_dims, grid_desc);

  int count = 1;
  for (int i = 0; i < nb_dims; ++i) {
    if (i == 1) {
      continue;
    }
    count *= output_desc.shape[i];
  }

  if (nb_dims == 4) {
    grid_sampler_2d_kernel<T>
        <<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
            count, input, grid, output, input_desc, grid_desc, output_desc,
            interp, padding, align_corners);
  } else if (nb_dims == 5) {
    grid_sampler_3d_kernel<T>
        <<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
            count, input, grid, output, input_desc, grid_desc, output_desc,
            interp, padding, align_corners);
  } else {
    printf("input and grid dims should be 4 or 5\n");
  }
}
void grid_sample_float(float *output, const float *input, const float *grid,
                       int *output_dims, int *input_dims, int *grid_dims,
                       int nb_dims, GridSamplerInterpolation interp,
                       GridSamplerPadding padding, bool align_corners,
                       cudaStream_t stream) {
  grid_sample<float>(output, input, grid, output_dims, input_dims, grid_dims,
                     nb_dims, interp, padding, align_corners, stream);
}
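// A minimal usage sketch (the helper name and shapes are illustrative, not
// part of the original file): bilinearly samples a (1, 1, 4, 4) input with a
// (1, 2, 2, 2) grid into a (1, 1, 2, 2) output. All pointers are assumed to
// be device buffers already allocated and filled by the caller.
void grid_sample_float_example(float *d_output, const float *d_input,
                               const float *d_grid, cudaStream_t stream) {
  int output_dims[] = {1, 1, 2, 2};
  int input_dims[] = {1, 1, 4, 4};
  int grid_dims[] = {1, 2, 2, 2};
  grid_sample_float(d_output, d_input, d_grid, output_dims, input_dims,
                    grid_dims, /*nb_dims=*/4,
                    GridSamplerInterpolation::Bilinear,
                    GridSamplerPadding::Zeros,
                    /*align_corners=*/false, stream);
}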
mmcv/ops/csrc/tensorrt/plugins/trt_instance_norm.cpp
deleted
100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
// Modified from:
// https://github.com/NVIDIA/TensorRT/blob/master/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp
#include "trt_instance_norm.hpp"
#include <cuda_fp16.h>
#include <stdexcept>
#include "trt_serialize.hpp"
using namespace nvinfer1;

cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype,
                                      cudnnDataType_t *cudnn_dtype) {
  switch (trt_dtype) {
    case nvinfer1::DataType::kFLOAT:
      *cudnn_dtype = CUDNN_DATA_FLOAT;
      break;
    case nvinfer1::DataType::kHALF:
      *cudnn_dtype = CUDNN_DATA_HALF;
      break;
    default:
      return CUDNN_STATUS_BAD_PARAM;
  }
  return CUDNN_STATUS_SUCCESS;
}

namespace {
constexpr const char *PLUGIN_VERSION{"1"};
constexpr const char *PLUGIN_NAME{"MMCVInstanceNormalization"};
}  // namespace

PluginFieldCollection InstanceNormalizationDynamicCreator::mFC{};
std::vector<PluginField> InstanceNormalizationDynamicCreator::mPluginAttributes;

InstanceNormalizationDynamic::InstanceNormalizationDynamic(
    const std::string &name, float epsilon)
    : mLayerName(name), mEpsilon(epsilon) {}

InstanceNormalizationDynamic::InstanceNormalizationDynamic(
    const std::string &name, void const *serialData, size_t serialLength)
    : mLayerName(name) {
  deserialize_value(&serialData, &serialLength, &mEpsilon);
}

InstanceNormalizationDynamic::~InstanceNormalizationDynamic() {}

// InstanceNormalizationDynamic returns one output.
int InstanceNormalizationDynamic::getNbOutputs() const { return 1; }

DimsExprs InstanceNormalizationDynamic::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) {
  nvinfer1::DimsExprs output(inputs[0]);
  return output;
}

int InstanceNormalizationDynamic::initialize() { return 0; }

void InstanceNormalizationDynamic::terminate() {}

size_t InstanceNormalizationDynamic::getWorkspaceSize(
    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
  int n = inputs[0].dims.d[0];
  int c = inputs[0].dims.d[1];
  int elem_size = mmcv::getElementSize(inputs[1].type);
  return mmcv::getAlignedSize(n * c * elem_size) * 2;
}
int InstanceNormalizationDynamic::enqueue(
    const nvinfer1::PluginTensorDesc *inputDesc,
    const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
    void *const *outputs, void *workspace, cudaStream_t stream) {
  nvinfer1::Dims input_dims = inputDesc[0].dims;
  int n = input_dims.d[0];
  int c = input_dims.d[1];
  int h = input_dims.d[2];
  int w = input_dims.nbDims > 3 ? input_dims.d[3] : 1;
  int elem_size = mmcv::getElementSize(inputDesc[1].type);

  void *n_scales = (void *)workspace;
  void *n_bias = (void *)(workspace + mmcv::getAlignedSize(n * c * elem_size));

  const void *scales = (const void *)inputs[1];
  const void *bias = (const void *)inputs[2];

  for (int i = 0; i < n; ++i) {
    cudaMemcpyAsync(n_scales + i * c * elem_size, scales, c * elem_size,
                    cudaMemcpyDeviceToDevice, stream);
    cudaMemcpyAsync(n_bias + i * c * elem_size, bias, c * elem_size,
                    cudaMemcpyDeviceToDevice, stream);
  }

  cudnnSetTensor4dDescriptor(_b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1,
                             n * c, 1, 1);
  cudnnDataType_t cudnn_dtype{};
  convert_trt2cudnn_dtype(inputDesc[0].type, &cudnn_dtype);
  cudnnSetTensor4dDescriptor(_x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c,
                             h, w);
  cudnnSetTensor4dDescriptor(_y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c,
                             h, w);
  float alpha = 1;
  float beta = 0;
  void const *x_ptr = inputs[0];
  void *y_ptr = outputs[0];
  cudnnSetStream(_cudnn_handle, stream);
  // Note: Use of CUDNN_BATCHNORM_SPATIAL_PERSISTENT can cause numerical
  // overflows (NaNs) for fp32 data in some circumstances. The lower-
  // performance CUDNN_BATCHNORM_SPATIAL should be used if this is not
  // acceptable.
  cudnnBatchNormalizationForwardTraining(
      _cudnn_handle, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, &alpha, &beta,
      _x_desc, x_ptr, _y_desc, y_ptr, _b_desc, n_scales, n_bias, 1., nullptr,
      nullptr, mEpsilon, nullptr, nullptr);
  return 0;
}
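// Implementation note: instance normalization over an (N, C, H, W) tensor is
// computed here by viewing it as a (1, N * C, H, W) tensor and running cuDNN
// batch normalization in training mode, with the per-channel scale/bias tiled
// N times in the workspace so each (n, c) slice is normalized independently.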
size_t InstanceNormalizationDynamic::getSerializationSize() const {
  return serialized_size(mEpsilon);
}

void InstanceNormalizationDynamic::serialize(void *buffer) const {
  serialize_value(&buffer, mEpsilon);
}

bool InstanceNormalizationDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
    int nbOutputs) {
  return ((inOut[pos].type == nvinfer1::DataType::kFLOAT ||
           inOut[pos].type == nvinfer1::DataType::kHALF) &&
          inOut[pos].format == nvinfer1::PluginFormat::kLINEAR &&
          inOut[pos].type == inOut[0].type);
}

const char *InstanceNormalizationDynamic::getPluginType() const {
  return PLUGIN_NAME;
}

const char *InstanceNormalizationDynamic::getPluginVersion() const {
  return PLUGIN_VERSION;
}

void InstanceNormalizationDynamic::destroy() { delete this; }

IPluginV2DynamicExt *InstanceNormalizationDynamic::clone() const {
  auto *plugin = new InstanceNormalizationDynamic{mLayerName, mEpsilon};
  plugin->setPluginNamespace(mPluginNamespace.c_str());
  return plugin;
}

// Set plugin namespace
void InstanceNormalizationDynamic::setPluginNamespace(
    const char *pluginNamespace) {
  mPluginNamespace = pluginNamespace;
}

const char *InstanceNormalizationDynamic::getPluginNamespace() const {
  return mPluginNamespace.c_str();
}

nvinfer1::DataType InstanceNormalizationDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
  return inputTypes[0];
}

// Attach the plugin object to an execution context and grant the plugin the
// access to some context resource.
void InstanceNormalizationDynamic::attachToContext(
    cudnnContext *cudnnContext, cublasContext *cublasContext,
    IGpuAllocator *gpuAllocator) {
  _cudnn_handle = cudnnContext;
  cudnnCreateTensorDescriptor(&_b_desc);
  cudnnCreateTensorDescriptor(&_x_desc);
  cudnnCreateTensorDescriptor(&_y_desc);
}

// Detach the plugin object from its execution context.
void InstanceNormalizationDynamic::detachFromContext() {
  cudnnDestroyTensorDescriptor(_y_desc);
  cudnnDestroyTensorDescriptor(_x_desc);
  cudnnDestroyTensorDescriptor(_b_desc);
}

void InstanceNormalizationDynamic::configurePlugin(
    const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
    const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) {}

// InstanceNormalizationDynamicCreator methods
InstanceNormalizationDynamicCreator::InstanceNormalizationDynamicCreator() {
  mPluginAttributes.clear();
  mPluginAttributes.emplace_back(
      PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *InstanceNormalizationDynamicCreator::getPluginName() const {
  return PLUGIN_NAME;
}

const char *InstanceNormalizationDynamicCreator::getPluginVersion() const {
  return PLUGIN_VERSION;
}

const PluginFieldCollection *
InstanceNormalizationDynamicCreator::getFieldNames() {
  return &mFC;
}

IPluginV2DynamicExt *InstanceNormalizationDynamicCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) {
  float epsilon = 1e-5;
  const PluginField *fields = fc->fields;
  for (int i = 0; i < fc->nbFields; ++i) {
    const char *attrName = fields[i].name;
    if (!strcmp(attrName, "epsilon")) {
      epsilon = *(static_cast<const float *>(fields[i].data));
    }
  }

  InstanceNormalizationDynamic *obj =
      new InstanceNormalizationDynamic(name, epsilon);
  obj->setPluginNamespace(mNamespace.c_str());
  return obj;
}

IPluginV2DynamicExt *InstanceNormalizationDynamicCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) {
  InstanceNormalizationDynamic *obj =
      new InstanceNormalizationDynamic{name, serialData, serialLength};
  obj->setPluginNamespace(mNamespace.c_str());
  return obj;
}

void InstanceNormalizationDynamicCreator::setPluginNamespace(
    const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *InstanceNormalizationDynamicCreator::getPluginNamespace() const {
  return mNamespace.c_str();
}
mmcv/ops/csrc/tensorrt/plugins/trt_modulated_deform_conv.cpp
deleted
100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_modulated_deform_conv.hpp"
#include <assert.h>
#include <chrono>
#include "trt_serialize.hpp"
void ModulatedDeformConvForwardCUDAKernelLauncher_float(
    const float *input, const float *weight, const float *bias,
    const float *offset, const float *mask, float *output, void *workspace,
    int batch, int channels, int height, int width, int channels_out,
    int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
    int pad_h, int dilation_w, int dilation_h, int group,
    int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
    cudaStream_t stream);

namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"MMCVModulatedDeformConv2d"};
}  // namespace

nvinfer1::PluginFieldCollection
    ModulatedDeformableConvPluginDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField>
    ModulatedDeformableConvPluginDynamicCreator::mPluginAttributes;

ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(
    const std::string &name, const nvinfer1::Dims stride,
    const nvinfer1::Dims padding, const nvinfer1::Dims dilation,
    const int deformableGroup, const int group)
    : mLayerName(name),
      mStride(stride),
      mPadding(padding),
      mDilation(dilation),
      mDeformableGroup(deformableGroup),
      mGroup(group) {
  mWithBias = false;
}

ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(
    const std::string name, const void *data, size_t length)
    : mLayerName(name) {
  deserialize_value(&data, &length, &mStride);
  deserialize_value(&data, &length, &mPadding);
  deserialize_value(&data, &length, &mDilation);
  deserialize_value(&data, &length, &mDeformableGroup);
  deserialize_value(&data, &length, &mGroup);
  mWithBias = false;
}

ModulatedDeformableConvPluginDynamic::~ModulatedDeformableConvPluginDynamic() {}

nvinfer1::IPluginV2DynamicExt *ModulatedDeformableConvPluginDynamic::clone()
    const {
  ModulatedDeformableConvPluginDynamic *plugin =
      new ModulatedDeformableConvPluginDynamic(
          mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::DimsExprs ModulatedDeformableConvPluginDynamic::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) {
  nvinfer1::DimsExprs ret;
  ret.nbDims = 4;
  ret.d[0] = inputs[0].d[0];
  ret.d[1] = inputs[3].d[0];
  ret.d[2] = inputs[1].d[2];
  ret.d[3] = inputs[1].d[3];
  return ret;
}

bool ModulatedDeformableConvPluginDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
    int nbOutputs) {
  if (pos == 0) {
    return (inOut[pos].type == nvinfer1::DataType::kFLOAT &&
            inOut[pos].format == nvinfer1::TensorFormat::kLINEAR);
  } else {
    return inOut[pos].type == inOut[0].type &&
           inOut[pos].format == inOut[0].format;
  }
}

void ModulatedDeformableConvPluginDynamic::configurePlugin(
    const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {
  if (nbInputs == 5) {
    mWithBias = true;
  }
}
size_t ModulatedDeformableConvPluginDynamic::getWorkspaceSize(
    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
  int sizeof_dtype = mmcv::getElementSize(outputs[0].type);

  int batch_size = inputs[0].dims.d[0];
  int nInputPlane = inputs[0].dims.d[1];
  int inputHeight = inputs[0].dims.d[2];
  int inputWidth = inputs[0].dims.d[3];

  int nOutputPlane = outputs[0].dims.d[1];
  int outputHeight = outputs[0].dims.d[2];
  int outputWidth = outputs[0].dims.d[3];

  int kW = inputs[3].dims.d[2];
  int kH = inputs[3].dims.d[3];
  int im2col_step = std::min(32, batch_size);

  size_t col_size = mmcv::getAlignedSize(nInputPlane * kW * kH * outputHeight *
                                         outputWidth * sizeof_dtype);

  return col_size;
}
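// Workspace note: a single im2col column buffer of shape
// (C_in * kH * kW) x (H_out * W_out) suffices, because enqueue processes the
// batch one sample at a time and reuses the same buffer for every sample.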
int ModulatedDeformableConvPluginDynamic::enqueue(
    const nvinfer1::PluginTensorDesc *inputDesc,
    const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
    void *const *outputs, void *workSpace, cudaStream_t stream) {
  int batch = inputDesc[0].dims.d[0];
  int channels = inputDesc[0].dims.d[1];
  int height = inputDesc[0].dims.d[2];
  int width = inputDesc[0].dims.d[3];
  int channels_out = outputDesc[0].dims.d[1];
  int kernel_h = inputDesc[3].dims.d[2];
  int kernel_w = inputDesc[3].dims.d[3];

  const void *x = inputs[0];
  const void *offset = inputs[1];
  const void *mask = inputs[2];
  const void *weight = inputs[3];
  const void *bias = mWithBias ? inputs[4] : nullptr;
  void *output = outputs[0];
  int im2col_step = std::min(batch, 32);

  // TODO: add fp16 support
  auto data_type = inputDesc[0].type;
  switch (data_type) {
    case nvinfer1::DataType::kFLOAT:
      ModulatedDeformConvForwardCUDAKernelLauncher_float(
          (float *)x, (float *)weight, (float *)bias, (float *)offset,
          (float *)mask, (float *)output, workSpace, batch, channels, height,
          width, channels_out, kernel_w, kernel_h, mStride.d[0], mStride.d[1],
          mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1],
          mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream);
      break;
    default:
      return 1;
      break;
  }

  return 0;
}

nvinfer1::DataType ModulatedDeformableConvPluginDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
  return inputTypes[0];
}

// IPluginV2 Methods
const char *ModulatedDeformableConvPluginDynamic::getPluginType() const {
  return PLUGIN_NAME;
}

const char *ModulatedDeformableConvPluginDynamic::getPluginVersion() const {
  return PLUGIN_VERSION;
}

int ModulatedDeformableConvPluginDynamic::getNbOutputs() const { return 1; }

int ModulatedDeformableConvPluginDynamic::initialize() { return 0; }

void ModulatedDeformableConvPluginDynamic::terminate() {}

size_t ModulatedDeformableConvPluginDynamic::getSerializationSize() const {
  return sizeof(mStride) + sizeof(mPadding) + sizeof(mDilation) +
         sizeof(mDeformableGroup) + sizeof(mGroup);
}

void ModulatedDeformableConvPluginDynamic::serialize(void *buffer) const {
  serialize_value(&buffer, mStride);
  serialize_value(&buffer, mPadding);
  serialize_value(&buffer, mDilation);
  serialize_value(&buffer, mDeformableGroup);
  serialize_value(&buffer, mGroup);
}

void ModulatedDeformableConvPluginDynamic::destroy() {
  // This gets called when the network containing plugin is destroyed
  delete this;
}

void ModulatedDeformableConvPluginDynamic::attachToContext(
    cudnnContext *cudnnContext, cublasContext *cublasContext,
    nvinfer1::IGpuAllocator *gpuAllocator) {
  m_cublas_handle = cublasContext;
}

void ModulatedDeformableConvPluginDynamic::detachFromContext() {}

void ModulatedDeformableConvPluginDynamic::setPluginNamespace(
    const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *ModulatedDeformableConvPluginDynamic::getPluginNamespace() const {
  return mNamespace.c_str();
}

////////////////////// creator /////////////////////////////
ModulatedDeformableConvPluginDynamicCreator::
    ModulatedDeformableConvPluginDynamicCreator() {
  mPluginAttributes.emplace_back(nvinfer1::PluginField("stride"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("padding"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("groups"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups"));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *ModulatedDeformableConvPluginDynamicCreator::getPluginName()
    const {
  return PLUGIN_NAME;
}

const char *ModulatedDeformableConvPluginDynamicCreator::getPluginVersion()
    const {
  return PLUGIN_VERSION;
}

const nvinfer1::PluginFieldCollection *
ModulatedDeformableConvPluginDynamicCreator::getFieldNames() {
  return &mFC;
}

nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) {
  nvinfer1::Dims stride{2, {1, 1}};
  nvinfer1::Dims padding{2, {0, 0}};
  nvinfer1::Dims dilation{2, {1, 1}};
  int deformableGroup = 1;
  int group = 1;
  for (int i = 0; i < fc->nbFields; i++) {
    if (fc->fields[i].data == nullptr) {
      continue;
    }
    std::string field_name(fc->fields[i].name);

    if (field_name.compare("deform_groups") == 0) {
      deformableGroup = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("group") == 0) {
      group = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("stride") == 0) {
      stride.nbDims = 2;
      stride.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      stride.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
    }

    if (field_name.compare("padding") == 0) {
      padding.nbDims = 2;
      padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
    }

    if (field_name.compare("dilation") == 0) {
      dilation.nbDims = 2;
      dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0];
      dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1];
    }
  }

  ModulatedDeformableConvPluginDynamic *plugin =
      new ModulatedDeformableConvPluginDynamic(name, stride, padding,
                                               dilation, deformableGroup,
                                               group);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::IPluginV2 *
ModulatedDeformableConvPluginDynamicCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) {
  auto plugin =
      new ModulatedDeformableConvPluginDynamic(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

void ModulatedDeformableConvPluginDynamicCreator::setPluginNamespace(
    const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *ModulatedDeformableConvPluginDynamicCreator::getPluginNamespace()
    const {
  return mNamespace.c_str();
}
mmcv/ops/csrc/tensorrt/plugins/trt_modulated_deform_conv_kernel.cu
deleted
100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include <assert.h>
#include <cuda_fp16.h>
#include "common_cuda_helper.hpp"
#include "modulated_deform_conv_cuda_kernel.cuh"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
template <typename T>
void trt_modulated_deformable_im2col(
    const T *data_im_, const T *data_offset_, const T *data_mask_,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, T *data_col_,
    cudaStream_t stream) {
  // num_axes should be smaller than block size
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels = channels * batch_size * height_col * width_col;

  modulated_deformable_im2col_gpu_kernel<T>
      <<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
          num_kernels, data_im_, data_offset_, data_mask_, height_im,
          width_im, kernel_h, kenerl_w, pad_h, pad_w, stride_h, stride_w,
          dilation_h, dilation_w, channel_per_deformable_group, batch_size,
          channels, deformable_group, height_col, width_col, data_col_);

  cudaCheckError();
}

template <typename scalar_t>
__global__ void output_add_bias_kernel(scalar_t *output, const scalar_t *bias,
                                       size_t step_batch, size_t step_channel,
                                       size_t n) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    output[index] += bias[(index % step_batch) / step_channel];
  }
}
template <typename scalar_t>
static void output_add_bias(scalar_t *output, const scalar_t *bias,
                            size_t batch, size_t channel, size_t height,
                            size_t width, cudaStream_t stream) {
  size_t step_channel = height * width;
  size_t step_batch = step_channel * channel;
  size_t n = step_batch * batch;
  output_add_bias_kernel<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(
      output, bias, step_batch, step_channel, n);
}
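// Index mapping check: for a flat NCHW index, (index % step_batch) is the
// offset inside one sample, and dividing it by step_channel (= H * W)
// recovers the channel id, so every element of channel c receives bias[c].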
template <typename scalar_t>
void ModulatedDeformConvForwardCUDAKernelLauncher(
    const scalar_t *input, const scalar_t *weight, const scalar_t *bias,
    const scalar_t *offset, const scalar_t *mask, scalar_t *output,
    void *workspace, int batch, int channels, int height, int width,
    int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h,
    int pad_w, int pad_h, int dilation_w, int dilation_h, int group,
    int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
    cudaStream_t stream) {
  size_t sizeof_dtype = sizeof(scalar_t);
  bool with_bias = (bias != nullptr);

  im2col_step = std::min(int(batch), im2col_step);
  assert(batch % im2col_step == 0);

  const int channels_kernel = channels / group;

  const int height_out =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

  scalar_t *columns = (scalar_t *)workspace;

  const size_t input_step = channels * height * width;
  const size_t offset_step =
      deformable_group * kernel_h * kernel_w * 2 * height * width;
  const size_t mask_step =
      deformable_group * kernel_h * kernel_w * height * width;
  const size_t out_step = channels_out * height_out * width_out;
  const size_t out_group_step = out_step / group;
  const size_t col_g_step =
      channels * kernel_w * kernel_h / group * height_out * width_out;
  const size_t weight_g_step =
      channels_out / group * channels / group * kernel_h * kernel_w;

  const int m = channels_out / group;
  const int n = height_out * width_out;
  const int k = channels / group * kernel_h * kernel_w;
  scalar_t alpha = 1.;
  scalar_t beta = 0.;

  for (int b = 0; b < batch; b++) {
    const scalar_t *input_start = input + b * input_step;
    const scalar_t *offset_start = offset + b * offset_step;
    const scalar_t *mask_start = mask + b * mask_step;
    trt_modulated_deformable_im2col<scalar_t>(
        input_start, offset_start, mask_start, 1, channels, height, width,
        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
        stride_w, dilation_h, dilation_w, deformable_group, columns, stream);

    for (int g = 0; g < group; g++) {
      const scalar_t *weight_start = weight + g * weight_g_step;
      scalar_t *col_start = columns + g * col_g_step;
      scalar_t *out_buffer_start = output + b * out_step + g * out_group_step;

      // cudaMemsetAsync(out_buffer_start, 0, 1, stream);
      cublasGemmWrap<scalar_t>(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m,
                               k, &alpha, col_start, n, weight_start, k,
                               &beta, out_buffer_start, n);
      cudaCheckError();
    }
  }

  if (with_bias) {
    output_add_bias<scalar_t>(output, bias, batch, channels_out, height_out,
                              width_out, stream);
  }
}
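// Note on the GEMM above: per sample and group, the convolution reduces to
// out[g] = weight[g] (m x k) * columns[g] (k x n). Since cuBLAS assumes
// column-major storage, the wrapper is called with swapped operands and
// leading dimensions (n, m, k) so that the row-major result is written to
// out_buffer_start directly, with no explicit transpose.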
void ModulatedDeformConvForwardCUDAKernelLauncher_float(
    const float *input, const float *weight, const float *bias,
    const float *offset, const float *mask, float *output, void *workspace,
    int batch, int channels, int height, int width, int channels_out,
    int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w,
    int pad_h, int dilation_w, int dilation_h, int group,
    int deformable_group, int im2col_step, cublasHandle_t cublas_handle,
    cudaStream_t stream) {
  ModulatedDeformConvForwardCUDAKernelLauncher<float>(
      input, weight, bias, offset, mask, output, workspace, batch, channels,
      height, width, channels_out, kernel_w, kernel_h, stride_w, stride_h,
      pad_w, pad_h, dilation_w, dilation_h, group, deformable_group,
      im2col_step, cublas_handle, stream);
}
mmcv/ops/csrc/tensorrt/plugins/trt_nms.cpp
deleted
100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_nms.hpp"
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include "trt_serialize.hpp"
extern size_t get_onnxnms_workspace_size(size_t num_batches,
                                         size_t spatial_dimension,
                                         size_t num_classes,
                                         size_t boxes_word_size,
                                         int center_point_box,
                                         size_t output_length);

extern void TRTNMSCUDAKernelLauncher_float(
    const float *boxes, const float *scores,
    const int max_output_boxes_per_class, const float iou_threshold,
    const float score_threshold, const int offset, int *output,
    int center_point_box, int num_batches, int spatial_dimension,
    int num_classes, size_t output_length, void *workspace,
    cudaStream_t stream);

namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"NonMaxSuppression"};
}  // namespace

nvinfer1::PluginFieldCollection NonMaxSuppressionDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField>
    NonMaxSuppressionDynamicCreator::mPluginAttributes;

NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(
    const std::string &name, int centerPointBox, int maxOutputBoxesPerClass,
    float iouThreshold, float scoreThreshold, int offset)
    : mLayerName(name),
      mCenterPointBox(centerPointBox),
      mMaxOutputBoxesPerClass(maxOutputBoxesPerClass),
      mIouThreshold(iouThreshold),
      mScoreThreshold(scoreThreshold),
      mOffset(offset) {}

NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(const std::string name,
                                                   const void *data,
                                                   size_t length)
    : mLayerName(name) {
  deserialize_value(&data, &length, &mCenterPointBox);
  deserialize_value(&data, &length, &mMaxOutputBoxesPerClass);
  deserialize_value(&data, &length, &mIouThreshold);
  deserialize_value(&data, &length, &mScoreThreshold);
  deserialize_value(&data, &length, &mOffset);
}

nvinfer1::IPluginV2DynamicExt *NonMaxSuppressionDynamic::clone() const {
  NonMaxSuppressionDynamic *plugin = new NonMaxSuppressionDynamic(
      mLayerName, mCenterPointBox, mMaxOutputBoxesPerClass, mIouThreshold,
      mScoreThreshold, mOffset);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::DimsExprs NonMaxSuppressionDynamic::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
    nvinfer1::IExprBuilder &exprBuilder) {
  nvinfer1::DimsExprs ret;
  ret.nbDims = 2;
  auto num_batches = inputs[0].d[0];
  auto spatial_dimension = inputs[0].d[1];
  if (mMaxOutputBoxesPerClass > 0) {
    spatial_dimension = exprBuilder.operation(
        nvinfer1::DimensionOperation::kMIN, *spatial_dimension,
        *exprBuilder.constant(mMaxOutputBoxesPerClass));
  }
  auto num_classes = inputs[1].d[1];
  ret.d[0] = exprBuilder.operation(
      nvinfer1::DimensionOperation::kPROD, *num_batches,
      *exprBuilder.operation(nvinfer1::DimensionOperation::kPROD,
                             *spatial_dimension, *num_classes));
  ret.d[1] = exprBuilder.constant(3);
  return ret;
}
bool NonMaxSuppressionDynamic::supportsFormatCombination(
    int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
    int nbOutputs) {
  if (pos < nbInputs) {
    switch (pos) {
      case 0:
        // boxes
        return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
               inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
      case 1:
        // scores
        return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
               inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
      default:
        return true;
    }
  } else {
    switch (pos - nbInputs) {
      case 0:
        // selected_indices
        return inOut[pos].type == nvinfer1::DataType::kINT32 &&
               inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
      default:
        return true;
    }
  }
  return true;
}

void NonMaxSuppressionDynamic::configurePlugin(
    const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {}

size_t NonMaxSuppressionDynamic::getWorkspaceSize(
    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
  size_t boxes_word_size = mmcv::getElementSize(inputs[0].type);
  size_t num_batches = inputs[0].dims.d[0];
  size_t spatial_dimension = inputs[0].dims.d[1];
  size_t num_classes = inputs[1].dims.d[1];
  size_t output_length = outputs[0].dims.d[0];

  return get_onnxnms_workspace_size(num_batches, spatial_dimension,
                                    num_classes, boxes_word_size,
                                    mCenterPointBox, output_length);
}

int NonMaxSuppressionDynamic::enqueue(
    const nvinfer1::PluginTensorDesc *inputDesc,
    const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
    void *const *outputs, void *workSpace, cudaStream_t stream) {
  int num_batches = inputDesc[0].dims.d[0];
  int spatial_dimension = inputDesc[0].dims.d[1];
  int num_classes = inputDesc[1].dims.d[1];
  int output_length = outputDesc[0].dims.d[0];

  const float *boxes = (const float *)inputs[0];
  const float *scores = (const float *)inputs[1];
  int *output = (int *)outputs[0];
  TRTNMSCUDAKernelLauncher_float(
      boxes, scores, mMaxOutputBoxesPerClass, mIouThreshold, mScoreThreshold,
      mOffset, output, mCenterPointBox, num_batches, spatial_dimension,
      num_classes, output_length, workSpace, stream);

  return 0;
}
nvinfer1::DataType NonMaxSuppressionDynamic::getOutputDataType(
    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
  return nvinfer1::DataType::kINT32;
}

// IPluginV2 Methods
const char *NonMaxSuppressionDynamic::getPluginType() const {
  return PLUGIN_NAME;
}

const char *NonMaxSuppressionDynamic::getPluginVersion() const {
  return PLUGIN_VERSION;
}

int NonMaxSuppressionDynamic::getNbOutputs() const { return 1; }

int NonMaxSuppressionDynamic::initialize() { return 0; }

void NonMaxSuppressionDynamic::terminate() {}

size_t NonMaxSuppressionDynamic::getSerializationSize() const {
  return sizeof(mCenterPointBox) + sizeof(mMaxOutputBoxesPerClass) +
         sizeof(mIouThreshold) + sizeof(mScoreThreshold) + sizeof(mOffset);
}

void NonMaxSuppressionDynamic::serialize(void *buffer) const {
  serialize_value(&buffer, mCenterPointBox);
  serialize_value(&buffer, mMaxOutputBoxesPerClass);
  serialize_value(&buffer, mIouThreshold);
  serialize_value(&buffer, mScoreThreshold);
  serialize_value(&buffer, mOffset);
}

void NonMaxSuppressionDynamic::destroy() {
  // This gets called when the network containing plugin is destroyed
  delete this;
}

void NonMaxSuppressionDynamic::setPluginNamespace(const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *NonMaxSuppressionDynamic::getPluginNamespace() const {
  return mNamespace.c_str();
}

////////////////////// creator /////////////////////////////
NonMaxSuppressionDynamicCreator::NonMaxSuppressionDynamicCreator() {
  mPluginAttributes.clear();
  mPluginAttributes.emplace_back(nvinfer1::PluginField("center_point_box"));
  mPluginAttributes.emplace_back(
      nvinfer1::PluginField("max_output_boxes_per_class"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("iou_threshold"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("score_threshold"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("offset"));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}

const char *NonMaxSuppressionDynamicCreator::getPluginName() const {
  return PLUGIN_NAME;
}

const char *NonMaxSuppressionDynamicCreator::getPluginVersion() const {
  return PLUGIN_VERSION;
}

const nvinfer1::PluginFieldCollection *
NonMaxSuppressionDynamicCreator::getFieldNames() {
  return &mFC;
}

nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::createPlugin(
    const char *name, const nvinfer1::PluginFieldCollection *fc) {
  int centerPointBox = 0;
  int maxOutputBoxesPerClass = 0;
  float iouThreshold = 0.0f;
  float scoreThreshold = 0.0f;
  int offset = 0;
  for (int i = 0; i < fc->nbFields; i++) {
    if (fc->fields[i].data == nullptr) {
      continue;
    }
    std::string field_name(fc->fields[i].name);

    if (field_name.compare("center_point_box") == 0) {
      centerPointBox = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("max_output_boxes_per_class") == 0) {
      maxOutputBoxesPerClass = static_cast<const int *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("iou_threshold") == 0) {
      iouThreshold = static_cast<const float *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("score_threshold") == 0) {
      scoreThreshold = static_cast<const float *>(fc->fields[i].data)[0];
    }

    if (field_name.compare("offset") == 0) {
      offset = static_cast<const int *>(fc->fields[i].data)[0];
    }
  }

  NonMaxSuppressionDynamic *plugin = new NonMaxSuppressionDynamic(
      name, centerPointBox, maxOutputBoxesPerClass, iouThreshold,
      scoreThreshold, offset);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

nvinfer1::IPluginV2 *NonMaxSuppressionDynamicCreator::deserializePlugin(
    const char *name, const void *serialData, size_t serialLength) {
  auto plugin = new NonMaxSuppressionDynamic(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}

void NonMaxSuppressionDynamicCreator::setPluginNamespace(
    const char *libNamespace) {
  mNamespace = libNamespace;
}

const char *NonMaxSuppressionDynamicCreator::getPluginNamespace() const {
  return mNamespace.c_str();
}
mmcv/ops/csrc/tensorrt/plugins/trt_nms_kernel.cu
deleted
100644 → 0
// Copyright (c) OpenMMLab. All rights reserved
#include <stdio.h>
#include <thrust/execution_policy.h>
#include <thrust/gather.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <chrono>
#include <thread>
#include <vector>
#include "common_cuda_helper.hpp"
#include "nms_cuda_kernel.cuh"
#include "trt_cuda_helper.cuh"
#include "trt_plugin_helper.hpp"
struct NMSBox {
  float box[4];
};

struct nms_centerwh2xyxy {
  __host__ __device__ NMSBox operator()(const NMSBox box) {
    NMSBox out;
    out.box[0] = box.box[0] - box.box[2] / 2.0f;
    out.box[1] = box.box[1] - box.box[3] / 2.0f;
    out.box[2] = box.box[0] + box.box[2] / 2.0f;
    out.box[3] = box.box[1] + box.box[3] / 2.0f;
    return out;
  }
};

struct nms_sbox_idle {
  const float *idle_box_;
  __host__ __device__ nms_sbox_idle(const float *idle_box) {
    idle_box_ = idle_box;
  }

  __host__ __device__ NMSBox operator()(const NMSBox box) {
    return {idle_box_[0], idle_box_[1], idle_box_[2], idle_box_[3]};
  }
};

struct nms_score_threshold {
  float score_threshold_;
  __host__ __device__ nms_score_threshold(const float score_threshold) {
    score_threshold_ = score_threshold;
  }

  __host__ __device__ bool operator()(const float score) {
    return score < score_threshold_;
  }
};
__global__ void nms_reindex_kernel(int n, int *output, int *index_cache) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    const int old_index = output[index * 3 + 2];
    output[index * 3 + 2] = index_cache[old_index];
  }
}
__global__ void mask_to_output_kernel(const unsigned long long *dev_mask,
                                      const int *index, int *output,
                                      int *output_count, int batch_id,
                                      int cls_id, int spatial_dimension,
                                      int col_blocks,
                                      int max_output_boxes_per_class) {
  extern __shared__ unsigned long long remv[];

  // fill remv with 0
  CUDA_1D_KERNEL_LOOP(i, col_blocks) { remv[i] = 0; }
  __syncthreads();

  int start = *output_count;
  int out_per_class_count = 0;
  for (int i = 0; i < spatial_dimension; i++) {
    const int nblock = i / threadsPerBlock;
    const int inblock = i % threadsPerBlock;
    if (!(remv[nblock] & (1ULL << inblock))) {
      if (threadIdx.x == 0) {
        output[start * 3 + 0] = batch_id;
        output[start * 3 + 1] = cls_id;
        output[start * 3 + 2] = index[i];
        start += 1;
      }
      out_per_class_count += 1;
      if (out_per_class_count >= max_output_boxes_per_class) {
        break;
      }
      __syncthreads();
      // set every overlap box with bit 1 in remv
      const unsigned long long *p = dev_mask + i * col_blocks;
      CUDA_1D_KERNEL_LOOP(j, col_blocks) {
        if (j >= nblock) {
          remv[j] |= p[j];
        }
      }  // j
      __syncthreads();
    }
  }  // i

  if (threadIdx.x == 0) {
    *output_count = start;
  }
}
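The dev_mask layout follows the classic CUDA NMS scheme: row i holds col_blocks 64-bit words, and bit j of word k is set when box i overlaps box k * threadsPerBlock + j above the IoU threshold (in mmcv's NMS kernels threadsPerBlock equals sizeof(unsigned long long) * 8, i.e. 64). A hypothetical host-side accessor, only to make the indexing concrete:

// Assumes threadsPerBlock == 64; box 'i' suppresses box 'j' when this bit is set.
bool suppresses(const unsigned long long *mask, int col_blocks, int i, int j) {
  return (mask[i * col_blocks + j / 64] >> (j % 64)) & 1ULL;
}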
size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension,
                                  size_t num_classes, size_t boxes_word_size,
                                  int center_point_box, size_t output_length) {
  size_t boxes_xyxy_workspace = 0;
  if (center_point_box == 1) {
    boxes_xyxy_workspace = mmcv::getAlignedSize(
        num_batches * spatial_dimension * 4 * boxes_word_size);
  }
  size_t scores_workspace =
      mmcv::getAlignedSize(spatial_dimension * boxes_word_size);
  size_t boxes_workspace =
      mmcv::getAlignedSize(spatial_dimension * 4 * boxes_word_size);
  const int col_blocks =
      (spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
  size_t mask_workspace = mmcv::getAlignedSize(spatial_dimension * col_blocks *
                                               sizeof(unsigned long long));
  size_t index_template_workspace =
      mmcv::getAlignedSize(spatial_dimension * sizeof(int));
  size_t index_workspace =
      mmcv::getAlignedSize(spatial_dimension * sizeof(int));
  size_t count_workspace = mmcv::getAlignedSize(sizeof(int));
  return scores_workspace + boxes_xyxy_workspace + boxes_workspace +
         mask_workspace + index_template_workspace + index_workspace +
         count_workspace;
}
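A worked example, assuming mmcv::getAlignedSize pads to 16 bytes and threadsPerBlock is 64 (both as used elsewhere in mmcv): for num_batches = 2, spatial_dimension = 1000, float boxes (boxes_word_size = 4) and center_point_box = 1, col_blocks = ceil(1000 / 64) = 16, so the sum is scores 4000 + boxes_xyxy 32000 + boxes 16000 + mask 1000 * 16 * 8 = 128000 + two index buffers of 4000 each + count 16, i.e. 188016 bytes in total. Note that num_classes and output_length do not enter the computation, since the per-class buffers are reused across batch/class iterations.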
/**
 * Launch the NonMaxSuppression kernel
 *
 * The NMS is performed per batch/class pair, sharing the kernel
 * implementation `nms_cuda`. For each batch/class, `boxes_sorted` and
 * `index_cache` are sorted by score, and `boxes_sorted` is fed to the
 * `nms_cuda` kernel. The output is then generated by `mask_to_output_kernel`
 * from `dev_mask` and the sorted index cache.
 *
 * @param[in] bboxes input boxes with shape [num_batches, spatial_dimension, 4]
 * @param[in] scores input scores with shape
 * [num_batches, num_classes, spatial_dimension]
 * @param[in] max_output_boxes_per_class max output boxes per class
 * @param[in] iou_threshold IoU threshold
 * @param[in] score_threshold score threshold
 * @param[in] offset box offset; only 0 or 1 is valid
 * @param[out] output with shape [output_length, 3]; each row contains the
 * indices (batch_id, class_id, box_id), filled with -1 if the result is not
 * valid
 * @param[in] center_point_box 0 if boxes are [left, top, right, bottom], 1 if
 * boxes are [center_x, center_y, width, height]
 * @param[in] num_batches batch size of boxes and scores
 * @param[in] spatial_dimension number of boxes per batch
 * @param[in] num_classes number of classes
 * @param[in] output_length maximum number of output rows
 * @param[in] workspace memory for all temporary variables
 * @param[in] stream CUDA stream
 */
void TRTNMSCUDAKernelLauncher_float(const float *boxes, const float *scores,
                                    const int max_output_boxes_per_class,
                                    const float iou_threshold,
                                    const float score_threshold,
                                    const int offset, int *output,
                                    int center_point_box, int num_batches,
                                    int spatial_dimension, int num_classes,
                                    size_t output_length, void *workspace,
                                    cudaStream_t stream) {
  const int col_blocks =
      (spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
  float *boxes_sorted = (float *)workspace;
  workspace = static_cast<char *>(workspace) +
              mmcv::getAlignedSize(spatial_dimension * 4 * sizeof(float));

  float *boxes_xyxy = nullptr;
  if (center_point_box == 1) {
    boxes_xyxy = (float *)workspace;
    workspace = static_cast<char *>(workspace) +
                mmcv::getAlignedSize(num_batches * spatial_dimension * 4 *
                                     sizeof(float));
    thrust::transform(thrust::cuda::par.on(stream), (NMSBox *)boxes,
                      (NMSBox *)(boxes + num_batches * spatial_dimension * 4),
                      (NMSBox *)boxes_xyxy, nms_centerwh2xyxy());
    cudaCheckError();
  }

  float *scores_sorted = (float *)workspace;
  workspace = static_cast<char *>(workspace) +
              mmcv::getAlignedSize(spatial_dimension * sizeof(float));

  unsigned long long *dev_mask = (unsigned long long *)workspace;
  workspace = static_cast<char *>(workspace) +
              mmcv::getAlignedSize(spatial_dimension * col_blocks *
                                   sizeof(unsigned long long));

  int *index_cache = (int *)workspace;
  workspace = static_cast<char *>(workspace) +
              mmcv::getAlignedSize(spatial_dimension * sizeof(int));

  // generate sequence [0,1,2,3,4 ....]
  int *index_template = (int *)workspace;
  workspace = static_cast<char *>(workspace) +
              mmcv::getAlignedSize(spatial_dimension * sizeof(int));
  thrust::sequence(thrust::cuda::par.on(stream), index_template,
                   index_template + spatial_dimension, 0);

  int max_output_boxes_per_class_cpu = max_output_boxes_per_class;
  if (max_output_boxes_per_class_cpu <= 0) {
    max_output_boxes_per_class_cpu = spatial_dimension;
  }

  int *output_count = (int *)workspace;
  workspace =
      static_cast<char *>(workspace) + mmcv::getAlignedSize(sizeof(int));
  cudaMemsetAsync(output_count, 0, sizeof(int), stream);

  // fill output with -1
  thrust::fill(thrust::cuda::par.on(stream), output,
               output + output_length * 3, -1);
  cudaCheckError();

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);

  for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
    for (int cls_id = 0; cls_id < num_classes; ++cls_id) {
      const int batch_cls_id = batch_id * num_classes + cls_id;

      // sort boxes by score
      cudaMemcpyAsync(scores_sorted, scores + batch_cls_id * spatial_dimension,
                      spatial_dimension * sizeof(float),
                      cudaMemcpyDeviceToDevice, stream);
      cudaCheckError();
      cudaMemcpyAsync(index_cache, index_template,
                      spatial_dimension * sizeof(int),
                      cudaMemcpyDeviceToDevice, stream);
      cudaCheckError();
      thrust::sort_by_key(thrust::cuda::par.on(stream), scores_sorted,
                          scores_sorted + spatial_dimension, index_cache,
                          thrust::greater<float>());
      if (center_point_box == 1) {
        thrust::gather(thrust::cuda::par.on(stream), index_cache,
                       index_cache + spatial_dimension,
                       (NMSBox *)(boxes_xyxy +
                                  batch_id * spatial_dimension * 4),
                       (NMSBox *)boxes_sorted);
      } else {
        thrust::gather(thrust::cuda::par.on(stream), index_cache,
                       index_cache + spatial_dimension,
                       (NMSBox *)(boxes + batch_id * spatial_dimension * 4),
                       (NMSBox *)boxes_sorted);
      }
      cudaCheckError();

      if (score_threshold > 0.0f) {
        thrust::transform_if(
            thrust::cuda::par.on(stream), (NMSBox *)boxes_sorted,
            (NMSBox *)(boxes_sorted + spatial_dimension * 4), scores_sorted,
            (NMSBox *)boxes_sorted, nms_sbox_idle(boxes_sorted),
            nms_score_threshold(score_threshold));
      }

      nms_cuda<<<blocks, threads, 0, stream>>>(
          spatial_dimension, iou_threshold, offset, boxes_sorted, dev_mask);

      // gather the outputs once dev_mask has been filled.
      mask_to_output_kernel<<<1, threadsPerBlock,
                              col_blocks * sizeof(unsigned long long),
                              stream>>>(
          dev_mask, index_cache, output, output_count, batch_id, cls_id,
          spatial_dimension, col_blocks, max_output_boxes_per_class_cpu);
    }  // cls_id
  }    // batch_id
}
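Finally, a hedged sketch of a call site for this launcher. d_boxes and d_scores are assumed device buffers already laid out as described in the doc comment above, and all thresholds and shapes are illustrative:

// Sketch only: error checking omitted; d_boxes/d_scores are hypothetical
// device pointers of shape [1, 1000, 4] and [1, 80, 1000] respectively.
int num_batches = 1, spatial_dimension = 1000, num_classes = 80;
size_t output_length = 100;
size_t ws_size = get_onnxnms_workspace_size(num_batches, spatial_dimension,
                                            num_classes, sizeof(float),
                                            /*center_point_box=*/0,
                                            output_length);
void *workspace = nullptr;
cudaMalloc(&workspace, ws_size);
int *d_output = nullptr;
cudaMalloc(&d_output, output_length * 3 * sizeof(int));
cudaStream_t stream;
cudaStreamCreate(&stream);
TRTNMSCUDAKernelLauncher_float(d_boxes, d_scores,
                               /*max_output_boxes_per_class=*/100,
                               /*iou_threshold=*/0.5f,
                               /*score_threshold=*/0.05f, /*offset=*/0,
                               d_output, /*center_point_box=*/0, num_batches,
                               spatial_dimension, num_classes, output_length,
                               workspace, stream);
cudaStreamSynchronize(stream);  // d_output now holds [batch, class, box] rows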