Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
mmdeploy
Commits
546b4279
Commit
546b4279
authored
Jun 25, 2025
by
limm
Browse files
add csrc and mmdeploy module
parent
502f4fb9
Pipeline
#2810
canceled with stages
Changes
447
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2363 additions
and
0 deletions
+2363
-0
csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.cpp
...nxruntime/modulated_deform_conv/modulated_deform_conv.cpp
+198
-0
csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.h
...onnxruntime/modulated_deform_conv/modulated_deform_conv.h
+58
-0
csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.cpp
.../mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.cpp
+129
-0
csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.h
csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.h
+46
-0
csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.cpp
...eploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.cpp
+368
-0
csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.h
...mdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.h
+48
-0
csrc/mmdeploy/backend_ops/onnxruntime/onnxruntime_register.cpp
...mmdeploy/backend_ops/onnxruntime/onnxruntime_register.cpp
+27
-0
csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.cpp
...d_ops/onnxruntime/roi_align_rotated/roi_align_rotated.cpp
+237
-0
csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.h
...end_ops/onnxruntime/roi_align_rotated/roi_align_rotated.h
+59
-0
csrc/mmdeploy/backend_ops/tensorrt/CMakeLists.txt
csrc/mmdeploy/backend_ops/tensorrt/CMakeLists.txt
+37
-0
csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp
...ploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp
+233
-0
csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp
...ploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp
+82
-0
csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.cpp
.../tensorrt/batched_rotated_nms/trt_batched_rotated_nms.cpp
+229
-0
csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.hpp
.../tensorrt/batched_rotated_nms/trt_batched_rotated_nms.hpp
+78
-0
csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.cpp
.../tensorrt/bicubic_interpolate/trt_bicubic_interpolate.cpp
+185
-0
csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.hpp
.../tensorrt/bicubic_interpolate/trt_bicubic_interpolate.hpp
+67
-0
csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.cu
...rrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.cu
+170
-0
csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.hpp
...rt/bicubic_interpolate/trt_bicubic_interpolate_kernel.hpp
+11
-0
csrc/mmdeploy/backend_ops/tensorrt/common/common_cuda_helper.hpp
...deploy/backend_ops/tensorrt/common/common_cuda_helper.hpp
+82
-0
csrc/mmdeploy/backend_ops/tensorrt/common/nms/batched_nms_kernel.hpp
...oy/backend_ops/tensorrt/common/nms/batched_nms_kernel.hpp
+19
-0
No files found.
Too many changes to show.
To preserve performance only
447 of 447+
files are displayed.
Plain diff
Email patch
csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.cpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#include "modulated_deform_conv.h"
#include <cmath>
#include <thread>
#include <vector>
#include "modulated_deform_conv/modulated_deform_conv_cpu.h"
#include "ort_utils.h"
namespace
mmdeploy
{
/// Single-threaded slice of a GEMM: computes rows [start_row, end_row) of
///   Y = alpha * A * B (+ beta * V broadcast per column) (+ beta * H elementwise),
/// where A is M x K, B is K x N, Y/H are M x N and V has length N.
/// V and H may be nullptr to skip the corresponding term. The inner product
/// over K is manually unrolled by 8 for throughput; a cleanup loop handles
/// the K % 8 tail. Note: the M parameter is unused (the row range is given
/// explicitly) but kept for signature compatibility with callers/threads.
void parallel_unroll_gemm(const float* A, const float* B, const float* V, const float* H,
                          const int32_t M, const int32_t N, const int32_t K, const float alpha,
                          const float beta, float* Y, const int32_t start_row,
                          const int32_t end_row)
{
    std::vector<float> tmp(N);  // per-row accumulator, reused across rows
    for (int32_t m = start_row; m < end_row; ++m)
    {
        for (int32_t n = 0; n < N; n++)
        {
            tmp[n] = 0;
        }
        {
            const int32_t remainder = K % 8;
            // Unrolled portion: each iteration consumes 8 consecutive k values,
            // so it must stop at K - remainder. (Running the loop to `k < K`
            // would read past the end of A and B and double-count the tail
            // that the cleanup loop below also processes.)
            for (int32_t k = 0; k < K - remainder; k += 8)
            {
                for (int32_t n = 0; n < N; n++)
                {
                    tmp[n] += A[m * K + k] * B[k * N + n];
                    tmp[n] += A[m * K + k + 1] * B[k * N + N + n];
                    tmp[n] += A[m * K + k + 2] * B[k * N + 2 * N + n];
                    tmp[n] += A[m * K + k + 3] * B[k * N + 3 * N + n];
                    tmp[n] += A[m * K + k + 4] * B[k * N + 4 * N + n];
                    tmp[n] += A[m * K + k + 5] * B[k * N + 5 * N + n];
                    tmp[n] += A[m * K + k + 6] * B[k * N + 6 * N + n];
                    tmp[n] += A[m * K + k + 7] * B[k * N + 7 * N + n];
                }
            }
            // Cleanup loop for the K % 8 trailing elements.
            for (int32_t k = K - remainder; k < K; k++)
            {
                for (int32_t n = 0; n < N; n++)
                {
                    tmp[n] += A[m * K + k] * B[k * N + n];
                }
            }
        }
        // Scale and add the optional bias terms, then write the row out.
        for (int32_t n = 0; n < N; n++)
        {
            tmp[n] *= alpha;
            if (V) tmp[n] += beta * V[n];
            if (H) tmp[n] += beta * H[m * N + n];
            Y[m * N + n] = tmp[n];
        }
    }
}
/// CPU reference implementation of modulated deformable conv2d (fp32, NCHW).
/// For every (batch, group) pair it runs deformable_im2col_2d to materialize
/// `columns`, seeds the output tile with `bias` (or zeros), then accumulates
/// filter x columns via parallel_unroll_gemm, sharded across hardware threads.
///
/// `columns` is caller-provided scratch of at least
/// (channels/group) * kernel_h * kernel_w * dst_h * dst_w floats; `dst` is the
/// output tensor. `mask` and `bias` may be nullptr.
/// NOTE(review): both `channels`/`src_c` and `num_output`/`dst_c` are passed;
/// callers appear to pass the same values for each pair — confirm before
/// relying on them differing.
void deformable_conv2d_ref_fp32(const float* src, const float* offset, const float* mask,
                                const float* filter, const float* bias, const int64_t batch,
                                const int64_t src_c, const int64_t src_h, const int64_t src_w,
                                const int64_t dst_c, const int64_t dst_h, const int64_t dst_w,
                                const int64_t group, const int64_t offset_group,
                                const int64_t channels, const int64_t num_output,
                                const int64_t kernel_h, const int64_t kernel_w,
                                const int64_t stride_h, const int64_t stride_w,
                                const int64_t pad_h, const int64_t pad_w,
                                const int64_t dilation_h, const int64_t dilation_w,
                                float* columns, float* dst)
{
    const int64_t ic_per_gp = channels / group;
    const int64_t oc_per_gp = num_output / group;
    // hardware_concurrency() may return 0 (unknown); that falls through to the
    // serial branch below.
    std::size_t num_threads = std::thread::hardware_concurrency();
    std::vector<std::thread> threads;
    threads.reserve(num_threads);
    for (int64_t b = 0; b < batch; ++b)
    {
        for (int64_t g = 0; g < group; ++g)
        {
            // Gather the deformably-sampled input patch into `columns`.
            deformable_im2col_2d<float>(
                src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
                offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
                mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w,
                src_h, src_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
                dilation_h, dilation_w, ic_per_gp, offset_group, dst_h, dst_w,
                mask != nullptr, columns);
            float* dst_ptr = dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
            // Seed the output tile: broadcast bias per output channel, or zero.
            if (bias != nullptr)
            {
                const float* bias_ptr = bias + g * oc_per_gp;
                for (int64_t oc = 0; oc < oc_per_gp; ++oc)
                {
                    for (int64_t hw = 0; hw < dst_h * dst_w; ++hw)
                    {
                        dst_ptr[oc * dst_h * dst_w + hw] = bias_ptr[oc];
                    }
                }
            }
            else
            {
                // memset's fill argument is an int; 0 (not 0.0f) is the
                // correct way to zero a float buffer bytewise.
                memset(dst_ptr, 0, sizeof(float) * oc_per_gp * dst_h * dst_w);
            }
            if (num_threads > 1)
            {
                // Shard the oc_per_gp output rows across threads.
                const int32_t n_rows = static_cast<int32_t>(
                    (oc_per_gp + static_cast<int64_t>(num_threads) - 1) /
                    static_cast<int64_t>(num_threads));
                for (int32_t i = 0; i < static_cast<int32_t>(num_threads); i++)
                {
                    const int32_t start_row = i * n_rows;
                    if (start_row >= oc_per_gp) break;  // no work left; don't spawn idle threads
                    int32_t end_row = start_row + n_rows;
                    if (end_row > oc_per_gp) end_row = oc_per_gp;
                    // beta = 1 with H = dst_ptr accumulates onto the bias seed.
                    std::thread t(parallel_unroll_gemm,
                                  filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w,
                                  columns, nullptr, dst_ptr, oc_per_gp, dst_h * dst_w,
                                  ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr,
                                  start_row, end_row);
                    threads.emplace_back(std::move(t));
                }
                // Wait for all threads to complete before the next group reuses `columns`.
                for (auto& t : threads) t.join();
                threads.clear();
            }
            else
            {
                // Serial fallback: one gemm over the full row range [0, oc_per_gp).
                parallel_unroll_gemm(filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w,
                                     columns, nullptr, dst_ptr, oc_per_gp, dst_h * dst_w,
                                     ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr,
                                     0, oc_per_gp);
            }
        }
    }
}
/// Reads the op attributes (stride/padding/dilation pairs as [h, w], plus the
/// deformable-group and group counts) once at kernel-construction time and
/// caches them in members for Compute().
MMCVModulatedDeformConvKernel::MMCVModulatedDeformConvKernel(const OrtApi& api,
                                                             const OrtKernelInfo* info)
    : ort_(api), info_(info)
{
    const auto stride_attr =
        ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
    stride_height_ = stride_attr[0];
    stride_width_ = stride_attr[1];

    const auto padding_attr =
        ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
    padding_height_ = padding_attr[0];
    padding_width_ = padding_attr[1];

    const auto dilation_attr =
        ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
    dilation_height_ = dilation_attr[0];
    dilation_width_ = dilation_attr[1];

    deformable_group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
    group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");

    // Default allocator used for the im2col scratch buffer in Compute().
    allocator_ = Ort::AllocatorWithDefaultOptions();
}
/// Runs one modulated deformable convolution: fetches the five inputs
/// (input, offset, mask, filter, optional bias), derives the output shape,
/// allocates the im2col scratch buffer, and delegates the math to
/// deformable_conv2d_ref_fp32.
void MMCVModulatedDeformConvKernel::Compute(OrtKernelContext* context)
{
    // Snapshot cached attributes into locals.
    const int64_t stride_height = stride_height_;
    const int64_t stride_width = stride_width_;
    const int64_t padding_height = padding_height_;
    const int64_t padding_width = padding_width_;
    const int64_t dilation_height = dilation_height_;
    const int64_t dilation_width = dilation_width_;
    const int64_t deformable_group = deformable_group_;
    const int64_t group = group_;

    // Inputs: 0 = input, 1 = offset, 2 = mask, 3 = filter, 4 = bias (optional).
    const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
    const float* input_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(input));
    const OrtValue* offset = ort_.KernelContext_GetInput(context, 1);
    const float* offset_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(offset));
    const OrtValue* mask = ort_.KernelContext_GetInput(context, 2);
    const float* mask_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(mask));
    const OrtValue* filter = ort_.KernelContext_GetInput(context, 3);
    const float* filter_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(filter));
    const OrtValue* bias = ort_.KernelContext_GetInput(context, 4);
    const float* bias_data =
        (bias != nullptr) ? reinterpret_cast<const float*>(ort_.GetTensorData<float>(bias))
                          : nullptr;

    OrtTensorDimensions input_dims(ort_, input);
    OrtTensorDimensions filter_dims(ort_, filter);
    const int64_t batch = input_dims[0];
    const int64_t channels = input_dims[1];
    const int64_t in_height = input_dims[2];
    const int64_t in_width = input_dims[3];
    const int64_t num_output = filter_dims[0];
    const int64_t kernel_height = filter_dims[2];
    const int64_t kernel_width = filter_dims[3];

    // Standard conv output-size formula.
    const int64_t out_height = floor(
        (in_height + 2 * padding_height - dilation_height * (kernel_height - 1) - 1) /
            stride_height +
        1);
    const int64_t out_width = floor(
        (in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) /
            stride_width +
        1);

    std::vector<int64_t> output_dims = {batch, num_output, out_height, out_width};
    OrtValue* output =
        ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size());
    float* out_ptr = ort_.GetTensorMutableData<float>(output);

    // Scratch buffer for one (batch, group) im2col tile, reused across all tiles.
    const int64_t column_len =
        (channels / group) * kernel_height * kernel_width * out_height * out_width;
    float* columns = static_cast<float*>(allocator_.Alloc(sizeof(float) * column_len));
    deformable_conv2d_ref_fp32(input_data, offset_data, mask_data, filter_data, bias_data,
                               batch, channels, in_height, in_width, num_output, out_height,
                               out_width, group, deformable_group, channels, num_output,
                               kernel_height, kernel_width, stride_height, stride_width,
                               padding_height, padding_width, dilation_height, dilation_width,
                               columns, out_ptr);
    allocator_.Free(columns);
}
// Register the op under both the "mmdeploy" and the legacy "mmcv" custom-op
// domains so models exported with either domain name resolve to this kernel.
REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVModulatedDeformConvOp);
REGISTER_ONNXRUNTIME_OPS(mmcv, MMCVModulatedDeformConvOp);
}
// namespace mmdeploy
csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.h
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#define ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#include <onnxruntime_cxx_api.h>
namespace
mmdeploy
{
// CPU kernel backing the MMCVModulatedDeformConv2d custom op.
// Attributes are read once in the constructor; Compute() performs the conv.
struct MMCVModulatedDeformConvKernel
{
    MMCVModulatedDeformConvKernel(const OrtApi& api, const OrtKernelInfo* info);

    void Compute(OrtKernelContext* context);

protected:
    Ort::CustomOpApi ort_;                        // C++ wrapper over the ORT C API
    const OrtKernelInfo* info_;                   // kernel-info handle (not owned)
    Ort::AllocatorWithDefaultOptions allocator_;  // allocator for im2col scratch

    // Cached op attributes: stride/padding/dilation as (height, width) pairs.
    int64_t stride_height_;
    int64_t stride_width_;
    int64_t padding_height_;
    int64_t padding_width_;
    int64_t dilation_height_;
    int64_t dilation_width_;
    int64_t deformable_group_;  // "deform_groups" attribute
    int64_t group_;             // "groups" attribute
};
// Custom-op schema for "MMCVModulatedDeformConv2d":
// 5 float inputs (input, offset, mask, filter, optional bias), 1 float output.
struct MMCVModulatedDeformConvOp
    : Ort::CustomOpBase<MMCVModulatedDeformConvOp, MMCVModulatedDeformConvKernel>
{
    void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const
    {
        return new MMCVModulatedDeformConvKernel(api, info);
    }
    const char* GetName() const
    {
        return "MMCVModulatedDeformConv2d";
    };
    size_t GetInputTypeCount() const
    {
        return 5;
    };
    ONNXTensorElementDataType GetInputType(size_t /*index*/) const
    {
        return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
    };
    OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t index) const
    {
        // The last input (index == 4) is optional, which is bias
        if (index == 4) return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL;
        return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
    }
    size_t GetOutputTypeCount() const
    {
        return 1;
    };
    ONNXTensorElementDataType GetOutputType(size_t /*index*/) const
    {
        return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
    };
    // force cpu
    const char* GetExecutionProviderType() const
    {
        return "CPUExecutionProvider";
    };
};
}
// namespace mmdeploy
#endif
csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.cpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#include "nms_match.h"
#include <assert.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ort_utils.h"
namespace
mmdeploy
{
// Axis-aligned box as (x1, y1, x2, y2) corner coordinates.
struct Box
{
    float x1, y1, x2, y2;
};

/// Intersection-over-union of two axis-aligned boxes. A small epsilon keeps
/// the division well-defined for degenerate (zero-area) boxes.
float nms_match_iou(Box box1, Box box2)
{
    // Corners of the intersection rectangle.
    const auto ix1 = std::max(box1.x1, box2.x1);
    const auto iy1 = std::max(box1.y1, box2.y1);
    const auto ix2 = std::min(box1.x2, box2.x2);
    const auto iy2 = std::min(box1.y2, box2.y2);
    const auto eps = 1e-10;
    // Clamp to zero when the boxes do not overlap.
    const auto iw = std::max(static_cast<float>(0), ix2 - ix1);
    const auto ih = std::max(static_cast<float>(0), iy2 - iy1);
    const auto first_area = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);
    const auto second_area = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
    const auto overlap = iw * ih;
    return overlap / (first_area + second_area - overlap + eps);
}
/// NMSMatch reads its thresholds from runtime inputs rather than attributes,
/// so construction only wires up the API wrapper and scratch allocator.
NMSMatchKernel::NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info)
    : ort_(api), info_(info)
{
    allocator_ = Ort::AllocatorWithDefaultOptions();  // used for the per-box `select` flags
}
/// Greedy "NMS match": for each (batch, class) pair, boxes are visited in
/// descending score order; every kept box records the lower-scored boxes it
/// suppresses (IoU >= iou_threshold). Each (keeper, suppressed) pair whose
/// keeper score exceeds score_threshold is emitted as a row
/// [batch, class, keeper_idx, suppressed_idx] in the int64 output.
void NMSMatchKernel::Compute(OrtKernelContext* context)
{
    // Inputs: 0 = boxes [nbatch, nboxes, 4], 1 = scores [nbatch, nclass, nboxes],
    // 2 = iou_threshold (scalar), 3 = score_threshold (scalar).
    const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0);
    const float* boxes_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(boxes));
    const OrtValue* scores = ort_.KernelContext_GetInput(context, 1);
    const float* scores_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(scores));
    const OrtValue* iou_threshold_ = ort_.KernelContext_GetInput(context, 2);
    const float iou_threshold_data = ort_.GetTensorData<float>(iou_threshold_)[0];
    const OrtValue* score_threshold_ = ort_.KernelContext_GetInput(context, 3);
    const float score_threshold_data = ort_.GetTensorData<float>(score_threshold_)[0];

    OrtTensorDimensions boxes_dim(ort_, boxes);
    OrtTensorDimensions scores_dim(ort_, scores);
    const int64_t nbatch = boxes_dim[0];
    const int64_t nboxes = boxes_dim[1];
    const int64_t nclass = scores_dim[1];
    assert(boxes_dim[2] == 4);  //(x1, x2, y1, y2)

    // Per-box "still alive" flags; only the first nboxes entries are used per pass.
    bool* select = static_cast<bool*>(allocator_.Alloc(sizeof(bool) * nbatch * nboxes));

    // Reads box `idx` of batch `k` into a Box.
    auto load_box = [&](int64_t k, int64_t idx) {
        Box out;
        const float* row = boxes_data + k * nboxes * 4 + idx * 4;
        out.x1 = row[0];
        out.y1 = row[1];
        out.x2 = row[2];
        out.y2 = row[3];
        return out;
    };

    std::vector<int64_t> res_order;  // flat [k, g, keeper, suppressed] quadruples
    for (int64_t k = 0; k < nbatch; k++)
    {
        for (int64_t g = 0; g < nclass; g++)
        {
            for (int64_t i = 0; i < nboxes; i++) select[i] = true;

            // Collect this class's scores (layout: k * nboxes * nclass + g * nboxes + i).
            std::vector<float> tmp_sc;
            for (int i = 0; i < nboxes; i++)
            {
                tmp_sc.push_back(scores_data[k * nboxes * nclass + g * nboxes + i]);
            }
            // Box indices sorted by descending score.
            std::vector<int64_t> order(tmp_sc.size());
            std::iota(order.begin(), order.end(), 0);
            std::sort(order.begin(), order.end(),
                      [&tmp_sc](int64_t id1, int64_t id2) { return tmp_sc[id1] > tmp_sc[id2]; });

            for (int64_t _i = 0; _i < nboxes; _i++)
            {
                const auto i = order[_i];
                if (select[i] == false) continue;
                std::vector<int64_t> suppressed;  // boxes this keeper knocks out
                for (int64_t _j = _i + 1; _j < nboxes; _j++)
                {
                    const auto j = order[_j];
                    if (select[j] == false) continue;
                    const Box vbox1 = load_box(k, i);
                    const Box vbox2 = load_box(k, j);
                    if (nms_match_iou(vbox1, vbox2) >= iou_threshold_data)
                    {
                        select[j] = false;
                        suppressed.push_back(j);
                    }
                }
                if (tmp_sc[i] > score_threshold_data && suppressed.size() != 0)
                {
                    for (const auto j : suppressed)
                    {
                        res_order.push_back(k);
                        res_order.push_back(g);
                        res_order.push_back(i);
                        res_order.push_back(j);
                    }
                }
            }
        }
    }

    // Output shape is [num_pairs, 4].
    std::vector<int64_t> inds_dims({(int64_t)res_order.size() / 4, 4});
    OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size());
    int64_t* res_data = ort_.GetTensorMutableData<int64_t>(res);
    memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());
    allocator_.Free(select);
}
// Expose NMSMatch under the "mmdeploy" custom-op domain.
REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSMatchOp);
}
// namespace mmdeploy
csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.h
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef ONNXRUNTIME_NMS_MATCH_H
#define ONNXRUNTIME_NMS_MATCH_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
namespace
mmdeploy
{
// CPU kernel backing the NMSMatch custom op. Thresholds arrive as runtime
// inputs, so no attributes are cached here.
struct NMSMatchKernel
{
    NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info);

    void Compute(OrtKernelContext* context);

private:
    Ort::CustomOpApi ort_;                        // C++ wrapper over the ORT C API
    const OrtKernelInfo* info_;                   // kernel-info handle (not owned)
    Ort::AllocatorWithDefaultOptions allocator_;  // scratch allocator for Compute()
};
// Custom-op schema for "NMSMatch": 4 float inputs (boxes, scores,
// iou_threshold, score_threshold), 1 int64 output of match quadruples.
struct NMSMatchOp : Ort::CustomOpBase<NMSMatchOp, NMSMatchKernel>
{
    void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const
    {
        return new NMSMatchKernel(api, info);
    }
    const char* GetName() const
    {
        return "NMSMatch";
    }
    size_t GetInputTypeCount() const
    {
        return 4;
    }
    ONNXTensorElementDataType GetInputType(size_t) const
    {
        return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
    }
    size_t GetOutputTypeCount() const
    {
        return 1;
    }
    ONNXTensorElementDataType GetOutputType(size_t) const
    {
        return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
    }
    // force cpu
    const char* GetExecutionProviderType() const
    {
        return "CPUExecutionProvider";
    }
};
}
// namespace mmdeploy
#endif // ONNXRUNTIME_NMS_MATCH_H
csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.cpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#include "nms_rotated.h"
#include <assert.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <iterator>
#include <numeric> // std::iota
#include <vector>
#include "ort_utils.h"
namespace
mmdeploy
{
namespace
{
    // Rotated box: center, extents, and rotation angle `a`.
    struct RotatedBox
    {
        float x_ctr, y_ctr, w, h, a;
    };

    // Minimal 2-D point/vector with the operators the geometry code needs.
    struct Point
    {
        float x, y;
        Point(const float& px = 0, const float& py = 0)
            : x(px), y(py)
        {}
        Point operator+(const Point& p) const
        {
            return Point(x + p.x, y + p.y);
        }
        Point& operator+=(const Point& p)
        {
            x += p.x;
            y += p.y;
            return *this;
        }
        Point operator-(const Point& p) const
        {
            return Point(x - p.x, y - p.y);
        }
        Point operator*(const float coeff) const
        {
            return Point(x * coeff, y * coeff);
        }
    };

    // Dot product of two 2-D vectors.
    float dot_2d(const Point& A, const Point& B)
    {
        return A.x * B.x + A.y * B.y;
    }

    // Z-component of the 2-D cross product (signed parallelogram area).
    float cross_2d(const Point& A, const Point& B)
    {
        return A.x * B.y - B.x * A.y;
    }
}  // namespace
/// Fills `pts` with the four corners of `box`.
/// NOTE: box.a is used as radians directly — the usual degree-to-radian
/// factor (M_PI / 180 == 0.01745329251) was deliberately removed upstream
/// ("MODIFIED" in the original).
void get_rotated_vertices(const RotatedBox& box, Point (&pts)[4])
{
    // double theta = box.a * 0.01745329251;  // original degree conversion
    const double theta = box.a;
    const float cosTheta2 = (float)cos(theta) * 0.5f;
    const float sinTheta2 = (float)sin(theta) * 0.5f;
    // y: top --> down; x: left --> right
    pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
    pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
    pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
    pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
    // Remaining two corners mirror the first two through the center.
    pts[2].x = 2 * box.x_ctr - pts[0].x;
    pts[2].y = 2 * box.y_ctr - pts[0].y;
    pts[3].x = 2 * box.x_ctr - pts[1].x;
    pts[3].y = 2 * box.y_ctr - pts[1].y;
}
// Collects all candidate vertices of the intersection polygon of two convex
// quadrilaterals: pairwise edge-edge intersections plus each quad's vertices
// that lie inside the other quad. Writes them (possibly with duplicates) into
// `intersections` and returns the count (at most 4*4 + 4 + 4 = 24).
int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4], Point (&intersections)[24])
{
    // Line vector
    // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
    Point vec1[4], vec2[4];
    for (int i = 0; i < 4; i++)
    {
        vec1[i] = pts1[(i + 1) % 4] - pts1[i];
        vec2[i] = pts2[(i + 1) % 4] - pts2[i];
    }
    // Line test - test all line combos for intersection
    int num = 0;  // number of intersections
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            // Solve for 2x2 Ax=b
            float det = cross_2d(vec2[j], vec1[i]);
            // This takes care of parallel lines
            if (fabs(det) <= 1e-14)
            {
                continue;
            }
            auto vec12 = pts2[j] - pts1[i];
            // Segment parameters along edge i of quad 1 and edge j of quad 2.
            float t1 = cross_2d(vec2[j], vec12) / det;
            float t2 = cross_2d(vec1[i], vec12) / det;
            // Keep the crossing only if it lies within both segments.
            if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f)
            {
                intersections[num++] = pts1[i] + vec1[i] * t1;
            }
        }
    }
    // Check for vertices of rect1 inside rect2
    {
        const auto& AB = vec2[0];
        const auto& DA = vec2[3];
        auto ABdotAB = dot_2d(AB, AB);
        auto ADdotAD = dot_2d(DA, DA);
        for (int i = 0; i < 4; i++)
        {
            // assume ABCD is the rectangle, and P is the point to be judged
            // P is inside ABCD iff. P's projection on AB lies within AB
            // and P's projection on AD lies within AD
            auto AP = pts1[i] - pts2[0];
            auto APdotAB = dot_2d(AP, AB);
            auto APdotAD = -dot_2d(AP, DA);
            if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD))
            {
                intersections[num++] = pts1[i];
            }
        }
    }
    // Reverse the check - check for vertices of rect2 inside rect1
    {
        const auto& AB = vec1[0];
        const auto& DA = vec1[3];
        auto ABdotAB = dot_2d(AB, AB);
        auto ADdotAD = dot_2d(DA, DA);
        for (int i = 0; i < 4; i++)
        {
            auto AP = pts2[i] - pts1[0];
            auto APdotAB = dot_2d(AP, AB);
            auto APdotAD = -dot_2d(AP, DA);
            if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD))
            {
                intersections[num++] = pts2[i];
            }
        }
    }
    return num;
}
// Graham-scan convex hull over the first `num_in` points of `p` (num_in <= 24).
// Writes the hull vertices into `q` and returns their count. When
// `shift_to_zero` is true the hull is left translated so the pivot is at the
// origin (sufficient for area computation); otherwise the original
// coordinates are restored.
int convex_hull_graham(const Point (&p)[24], const int& num_in, Point (&q)[24], bool shift_to_zero = false)
{
    assert(num_in >= 2);
    // Step 1:
    // Find point with minimum y
    // if more than 1 points have the same minimum y,
    // pick the one with the minimum x.
    int t = 0;
    for (int i = 1; i < num_in; i++)
    {
        if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x))
        {
            t = i;
        }
    }
    auto& start = p[t];  // starting point
    // Step 2:
    // Subtract starting point from every points (for sorting in the next step)
    for (int i = 0; i < num_in; i++)
    {
        q[i] = p[i] - start;
    }
    // Swap the starting point to position 0
    auto tmp = q[0];
    q[0] = q[t];
    q[t] = tmp;
    // Step 3:
    // Sort point 1 ~ num_in according to their relative cross-product values
    // (essentially sorting according to angles)
    // If the angles are the same, sort according to their distance to origin
    float dist[24];
    for (int i = 0; i < num_in; i++)
    {
        dist[i] = dot_2d(q[i], q[i]);
    }
    // CPU version
    std::sort(q + 1, q + num_in, [](const Point& A, const Point& B) -> bool {
        float temp = cross_2d(A, B);
        if (fabs(temp) < 1e-6)
        {
            return dot_2d(A, A) < dot_2d(B, B);
        }
        else
        {
            return temp > 0;
        }
    });
    // compute distance to origin after sort, since the points are now different.
    for (int i = 0; i < num_in; i++)
    {
        dist[i] = dot_2d(q[i], q[i]);
    }
    // Step 4:
    // Make sure there are at least 2 points (that don't overlap with each other)
    // in the stack
    int k;  // index of the non-overlapped second point
    for (k = 1; k < num_in; k++)
    {
        if (dist[k] > 1e-8)
        {
            break;
        }
    }
    if (k == num_in)
    {
        // We reach the end, which means the convex hull is just one point
        q[0] = p[t];
        return 1;
    }
    q[1] = q[k];
    int m = 2;  // 2 points in the stack
    // Step 5:
    // Finally we can start the scanning process.
    // When a non-convex relationship between the 3 points is found
    // (either concave shape or duplicated points),
    // we pop the previous point from the stack
    // until the 3-point relationship is convex again, or
    // until the stack only contains two points
    for (int i = k + 1; i < num_in; i++)
    {
        while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0)
        {
            m--;
        }
        q[m++] = q[i];
    }
    // Step 6 (Optional):
    // In general sense we need the original coordinates, so we
    // need to shift the points back (reverting Step 2)
    // But if we're only interested in getting the area/perimeter of the shape
    // We can simply return.
    if (!shift_to_zero)
    {
        for (int i = 0; i < m; i++)
        {
            q[i] += start;
        }
    }
    return m;
}
/// Area of a convex polygon given as `m` ordered vertices in `q`, via a
/// triangle fan anchored at q[0] (shoelace with fabs per triangle).
/// Degenerate polygons (fewer than 3 vertices) have zero area.
float polygon_area(const Point (&q)[24], const int& m)
{
    if (m <= 2)
    {
        return 0;
    }
    float twice_area = 0;
    for (int i = 1; i < m - 1; i++)
    {
        twice_area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0]));
    }
    return twice_area / 2.0;
}
/// Intersection area of two rotated boxes: gather candidate vertices, order
/// them with a convex hull, then measure the resulting polygon.
float rotated_boxes_intersection(const RotatedBox& box1, const RotatedBox& box2)
{
    // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
    // from rotated_rect_intersection_pts
    Point intersectPts[24], orderedPts[24];
    Point pts1[4];
    Point pts2[4];

    get_rotated_vertices(box1, pts1);
    get_rotated_vertices(box2, pts2);

    const int num = get_intersection_points(pts1, pts2, intersectPts);
    if (num <= 2)
    {
        return 0.0;  // no polygonal overlap
    }

    // Convex Hull to order the intersection points in clockwise order and find
    // the contour area (shift_to_zero = true: only the area is needed).
    const int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true);
    return polygon_area(orderedPts, num_convex);
}
/// Caches the "iou_threshold" and "score_threshold" attributes and sets up
/// the scratch allocator used by Compute().
NMSRotatedKernel::NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info)
    : ort_(api), info_(info)
{
    iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
    score_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "score_threshold");
    allocator_ = Ort::AllocatorWithDefaultOptions();
}
/// Rotated NMS over boxes [nbatch, nboxes, 5] = (cx, cy, w, h, theta) and
/// scores [nbatch, nclass, nboxes]. For each (batch, class), boxes are walked
/// in descending-score order and lower-ranked boxes with IoU above
/// iou_threshold_ are suppressed. Survivors above score_threshold_ are
/// emitted as [batch, class, box_idx] rows in the int64 output.
void NMSRotatedKernel::Compute(OrtKernelContext* context)
{
    const float iou_threshold = iou_threshold_;
    const float score_threshold = score_threshold_;

    const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0);
    const float* boxes_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(boxes));
    const OrtValue* scores = ort_.KernelContext_GetInput(context, 1);
    const float* scores_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(scores));

    OrtTensorDimensions boxes_dim(ort_, boxes);
    OrtTensorDimensions scores_dim(ort_, scores);
    const int64_t nbatch = boxes_dim[0];
    const int64_t nboxes = boxes_dim[1];
    const int64_t nclass = scores_dim[1];
    assert(boxes_dim[2] == 5);  //(cx,cy,w,h,theta)

    // Working copies of the inputs plus the per-box alive flags.
    float* tmp_boxes = static_cast<float*>(allocator_.Alloc(sizeof(float) * nbatch * nboxes * 5));
    float* sc = static_cast<float*>(allocator_.Alloc(sizeof(float) * nbatch * nclass * nboxes));
    bool* select = static_cast<bool*>(allocator_.Alloc(sizeof(bool) * nbatch * nboxes));
    memcpy(tmp_boxes, boxes_data, sizeof(float) * nbatch * nboxes * 5);
    memcpy(sc, scores_data, sizeof(float) * nbatch * nclass * nboxes);

    std::vector<int64_t> res_order;  // flat [k, g, box] triples
    for (int64_t k = 0; k < nbatch; k++)
    {
        for (int64_t g = 0; g < nclass; g++)
        {
            for (int64_t i = 0; i < nboxes; i++) select[i] = true;

            // Scores for this (batch, class), then indices sorted descending.
            std::vector<float> tmp_sc;
            for (int i = 0; i < nboxes; i++)
            {
                tmp_sc.push_back(sc[k * nboxes * nclass + g * nboxes + i]);
            }
            std::vector<int64_t> order(tmp_sc.size());
            std::iota(order.begin(), order.end(), 0);
            std::sort(order.begin(), order.end(),
                      [&tmp_sc](int64_t id1, int64_t id2) { return tmp_sc[id1] > tmp_sc[id2]; });

            for (int64_t _i = 0; _i < nboxes; _i++)
            {
                // NOTE(review): `select` is indexed by sorted rank here (unlike
                // nms_match, which indexes by original box id) — preserved as-is;
                // verify against the upstream reference if this looks off.
                if (select[_i] == false) continue;
                const auto i = order[_i];
                for (int64_t _j = _i + 1; _j < nboxes; _j++)
                {
                    if (select[_j] == false) continue;
                    const auto j = order[_j];
                    const float* row_i = tmp_boxes + k * nboxes * 5 + i * 5;
                    const float* row_j = tmp_boxes + k * nboxes * 5 + j * 5;
                    // Re-center both boxes around their midpoint to keep the
                    // polygon-clipping math well-conditioned.
                    const auto center_shift_x = (row_i[0] + row_j[0]) / 2.0;
                    const auto center_shift_y = (row_i[1] + row_j[1]) / 2.0;
                    RotatedBox box1, box2;
                    box1.x_ctr = row_i[0] - center_shift_x;
                    box1.y_ctr = row_i[1] - center_shift_y;
                    box1.w = row_i[2];
                    box1.h = row_i[3];
                    box1.a = row_i[4];
                    box2.x_ctr = row_j[0] - center_shift_x;
                    box2.y_ctr = row_j[1] - center_shift_y;
                    box2.w = row_j[2];
                    box2.h = row_j[3];
                    box2.a = row_j[4];
                    const auto area1 = box1.w * box1.h;
                    const auto area2 = box2.w * box2.h;
                    const auto intersection = rotated_boxes_intersection(box1, box2);
                    float baseS = 1.0;
                    baseS = (area1 + area2 - intersection);
                    const auto ovr = intersection / baseS;
                    if (ovr > iou_threshold) select[_j] = false;
                }
            }
            // Emit surviving boxes whose score clears the threshold.
            for (int i = 0; i < nboxes; i++)
            {
                if (select[i] & (tmp_sc[order[i]] > score_threshold))
                {
                    res_order.push_back(k);
                    res_order.push_back(g);
                    res_order.push_back(order[i]);
                }
            }
        }  // class loop
    }  // batch loop

    // Output shape is [num_kept, 3].
    std::vector<int64_t> inds_dims({(int64_t)res_order.size() / 3, 3});
    OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size());
    int64_t* res_data = ort_.GetTensorMutableData<int64_t>(res);
    memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());

    allocator_.Free(tmp_boxes);
    allocator_.Free(sc);
    allocator_.Free(select);
}
// Expose NMSRotated under the "mmdeploy" custom-op domain.
REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSRotatedOp);
}
// namespace mmdeploy
csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.h
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef ONNXRUNTIME_NMS_ROTATED_H
#define ONNXRUNTIME_NMS_ROTATED_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
namespace
mmdeploy
{
// CPU kernel implementing rotated non-maximum suppression for the
// "NMSRotated" ONNX Runtime custom op. The actual algorithm lives in
// Compute() (defined in nms_rotated.cpp).
struct NMSRotatedKernel
{
    // Reads the op attributes (iou_threshold / score_threshold) from `info`.
    NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info);

    // Runs NMS on the boxes/scores inputs of `context` and writes the
    // selected (batch, class, box) index triplets to output 0.
    void Compute(OrtKernelContext* context);

  private:
    Ort::CustomOpApi                 ort_;        // thin wrapper over the ORT C API
    const OrtKernelInfo*             info_;
    Ort::AllocatorWithDefaultOptions allocator_;  // used for temporary buffers in Compute
    float                            iou_threshold_;    // boxes overlapping more than this are suppressed
    float                            score_threshold_;  // boxes scoring at or below this are dropped
};
// Schema/factory for the "NMSRotated" custom op: 2 float inputs
// (boxes, scores), 1 int64 output (selected index triplets), CPU only.
struct NMSRotatedOp : Ort::CustomOpBase<NMSRotatedOp, NMSRotatedKernel>
{
    void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const
    {
        return new NMSRotatedKernel(api, info);  // owned and freed by ORT
    }
    const char* GetName() const { return "NMSRotated"; }

    size_t                    GetInputTypeCount() const { return 2; }
    ONNXTensorElementDataType GetInputType(size_t) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }

    size_t                    GetOutputTypeCount() const { return 1; }
    ONNXTensorElementDataType GetOutputType(size_t) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; }

    // force cpu
    const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
};
}
// namespace mmdeploy
#endif // ONNXRUNTIME_NMS_ROTATED_H
csrc/mmdeploy/backend_ops/onnxruntime/onnxruntime_register.cpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#include "onnxruntime_register.h"
#include "ort_utils.h"
// Domain name under which all mmdeploy custom ops are registered.
const char* c_MMDeployOpDomain = "mmdeploy";

/// Entry point called by ONNX Runtime to register every mmdeploy custom op
/// domain on the given session options.
/// @param options  session options the op domains are added to.
/// @param api      ORT API table provider.
/// @return nullptr on success, otherwise the first failing OrtStatus.
OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api)
{
    const OrtApi* kOrtApi = api->GetApi(ORT_API_VERSION);
    for (auto& _op_list_pair : mmdeploy::get_mmdeploy_custom_ops())
    {
        OrtCustomOpDomain* domain = nullptr;
        // Create one domain per (domain-name, op-list) pair.
        if (auto create_status = kOrtApi->CreateCustomOpDomain(_op_list_pair.first.c_str(), &domain))
        {
            return create_status;
        }
        auto& _op_list = _op_list_pair.second;
        for (auto& _op : _op_list)
        {
            if (auto add_op_status = kOrtApi->CustomOpDomain_Add(domain, _op))
            {
                return add_op_status;
            }
        }
        // Bail out immediately on failure. The original code stored this
        // status and kept looping, so an error from one domain was silently
        // overwritten by the next iteration.
        if (auto add_domain_status = kOrtApi->AddCustomOpDomain(options, domain))
        {
            return add_domain_status;
        }
    }
    return nullptr;
}
csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.cpp
0 → 100644
View file @
546b4279
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
#include "roi_align_rotated.h"
#include "ort_utils.h"
namespace
mmdeploy
{
// implementation taken from Caffe2
struct
PreCalc
{
int
pos1
;
int
pos2
;
int
pos3
;
int
pos4
;
float
w1
;
float
w2
;
float
w3
;
float
w4
;
};
// Precomputes, for every (ph, pw, iy, ix) sampling point of one rotated RoI,
// the four neighbour offsets and bilinear weights used later by the pooling
// loop. Entries are appended in (ph, pw, iy, ix) order into `pre_calc`, which
// must already be sized to pooled_height * pooled_width * iy_upper * ix_upper;
// the consumer must iterate in the same order. Sampling points that fall
// outside the feature map are stored as all-zero entries (weights 0 => they
// contribute nothing).
void pre_calc_for_bilinear_interpolate(
    const int height, const int width, const int pooled_height, const int pooled_width,
    const int iy_upper, const int ix_upper, float roi_start_h, float roi_start_w,
    float bin_size_h, float bin_size_w, int roi_bin_grid_h, int roi_bin_grid_w,
    float roi_center_h, float roi_center_w, float cos_theta, float sin_theta,
    std::vector<PreCalc>& pre_calc)
{
    int pre_calc_index = 0;
    for (int ph = 0; ph < pooled_height; ph++)
    {
        for (int pw = 0; pw < pooled_width; pw++)
        {
            for (int iy = 0; iy < iy_upper; iy++)
            {
                // Sample position inside the bin, in the RoI-local frame
                // (before rotation/translation).
                const float yy = roi_start_h + ph * bin_size_h +
                                 static_cast<float>(iy + .5f) * bin_size_h /
                                     static_cast<float>(roi_bin_grid_h);  // e.g., 0.5, 1.5
                for (int ix = 0; ix < ix_upper; ix++)
                {
                    const float xx = roi_start_w + pw * bin_size_w +
                                     static_cast<float>(ix + .5f) * bin_size_w /
                                         static_cast<float>(roi_bin_grid_w);
                    // Rotate by theta around the center and translate
                    // In image space, (y, x) is the order for Right Handed System,
                    // and this is essentially multiplying the point by a rotation matrix
                    // to rotate it counterclockwise through angle theta.
                    float y = yy * cos_theta - xx * sin_theta + roi_center_h;
                    float x = yy * sin_theta + xx * cos_theta + roi_center_w;
                    // deal with: inverse elements are out of feature map boundary
                    if (y < -1.0 || y > height || x < -1.0 || x > width)
                    {
                        // empty: zero weights, so the sample contributes nothing
                        PreCalc pc;
                        pc.pos1 = 0;
                        pc.pos2 = 0;
                        pc.pos3 = 0;
                        pc.pos4 = 0;
                        pc.w1 = 0;
                        pc.w2 = 0;
                        pc.w3 = 0;
                        pc.w4 = 0;
                        pre_calc[pre_calc_index] = pc;
                        pre_calc_index += 1;
                        continue;
                    }
                    // Clamp slightly-outside points onto the map edge.
                    if (y < 0)
                    {
                        y = 0;
                    }
                    if (x < 0)
                    {
                        x = 0;
                    }
                    int y_low = (int)y;
                    int x_low = (int)x;
                    int y_high;
                    int x_high;
                    // Clamp the high neighbour to the last row/column; also snap
                    // y/x so the fractional part (and thus ly/lx) becomes zero.
                    if (y_low >= height - 1)
                    {
                        y_high = y_low = height - 1;
                        y = (float)y_low;
                    }
                    else
                    {
                        y_high = y_low + 1;
                    }
                    if (x_low >= width - 1)
                    {
                        x_high = x_low = width - 1;
                        x = (float)x_low;
                    }
                    else
                    {
                        x_high = x_low + 1;
                    }
                    // Standard bilinear weights from the fractional offsets.
                    float ly = y - y_low;
                    float lx = x - x_low;
                    float hy = 1. - ly, hx = 1. - lx;
                    float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
                    // save weights and indices
                    PreCalc pc;
                    pc.pos1 = y_low * width + x_low;
                    pc.pos2 = y_low * width + x_high;
                    pc.pos3 = y_high * width + x_low;
                    pc.pos4 = y_high * width + x_high;
                    pc.w1 = w1;
                    pc.w2 = w2;
                    pc.w3 = w3;
                    pc.w4 = w4;
                    pre_calc[pre_calc_index] = pc;
                    pre_calc_index += 1;
                }
            }
        }
    }
}
// CPU forward pass of rotated RoIAlign.
// `input` is a NCHW float feature map, `rois` holds n_rois rows of
// (batch_idx, center_x, center_y, w, h, angle) — 6 floats per RoI — and
// `output` receives (n_rois, channels, pooled_height, pooled_width) values.
// `nthreads` is the total number of output elements. Angle is in radians;
// `clockwise` flips its sign; `aligned` applies the half-pixel offset.
void ROIAlignRotatedForwardCPU(const int nthreads, const float* input, const float* rois,
                               float* output, const float& spatial_scale, const int aligned,
                               const int clockwise, const int channels, const int height,
                               const int width, const int pooled_height, const int pooled_width,
                               const int sampling_ratio)
{
    int n_rois = nthreads / channels / pooled_width / pooled_height;
    // (n, c, ph, pw) is an element in the pooled output
    // can be parallelized using omp
    // #pragma omp parallel for num_threads(32)
    for (int n = 0; n < n_rois; n++)
    {
        int          index_n = n * channels * pooled_width * pooled_height;
        const float* current_roi = rois + n * 6;  // 6 floats per RoI row
        int          roi_batch_ind = current_roi[0];
        // Do not use rounding; this implementation detail is critical
        float offset = aligned ? (float)0.5 : (float)0.0;
        float roi_center_w = current_roi[1] * spatial_scale - offset;
        float roi_center_h = current_roi[2] * spatial_scale - offset;
        float roi_width = current_roi[3] * spatial_scale;
        float roi_height = current_roi[4] * spatial_scale;
        // float theta = current_roi[5] * M_PI / 180.0;
        float theta = current_roi[5];  // Radian angle by default
        if (clockwise)
        {
            theta = -theta;  // clockwise convention: negate the CCW angle
        }
        float cos_theta = cos(theta);
        float sin_theta = sin(theta);
        if (!aligned)
        {
            // for backward-compatibility only
            roi_width = std::max(roi_width, (float)1.);
            roi_height = std::max(roi_height, (float)1.);
        }
        float bin_size_h = static_cast<float>(roi_height) / static_cast<float>(pooled_height);
        float bin_size_w = static_cast<float>(roi_width) / static_cast<float>(pooled_width);
        // We use roi_bin_grid to sample the grid and mimic integral
        int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height);  // e.g., = 2
        int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
        // We do average (integral) pooling inside a bin
        const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4
        // we want to precalculate indices and weights shared by all channels,
        // this is the key point of optimization
        std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
        // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
        // Appropriate translation needs to be applied after.
        float roi_start_h = -roi_height / 2.0;
        float roi_start_w = -roi_width / 2.0;
        pre_calc_for_bilinear_interpolate(height, width, pooled_height, pooled_width,
                                          roi_bin_grid_h, roi_bin_grid_w, roi_start_h, roi_start_w,
                                          bin_size_h, bin_size_w, roi_bin_grid_h, roi_bin_grid_w,
                                          roi_center_h, roi_center_w, cos_theta, sin_theta,
                                          pre_calc);
        for (int c = 0; c < channels; c++)
        {
            int          index_n_c = index_n + c * pooled_width * pooled_height;
            const float* offset_input = input + (roi_batch_ind * channels + c) * height * width;
            // Must walk pre_calc in exactly the order it was filled.
            int pre_calc_index = 0;
            for (int ph = 0; ph < pooled_height; ph++)
            {
                for (int pw = 0; pw < pooled_width; pw++)
                {
                    int   index = index_n_c + ph * pooled_width + pw;
                    float output_val = 0.;
                    for (int iy = 0; iy < roi_bin_grid_h; iy++)
                    {
                        for (int ix = 0; ix < roi_bin_grid_w; ix++)
                        {
                            PreCalc pc = pre_calc[pre_calc_index];
                            output_val += pc.w1 * offset_input[pc.pos1] +
                                          pc.w2 * offset_input[pc.pos2] +
                                          pc.w3 * offset_input[pc.pos3] +
                                          pc.w4 * offset_input[pc.pos4];
                            pre_calc_index += 1;
                        }
                    }
                    // average over the sampling grid
                    output_val /= count;
                    output[index] = output_val;
                }  // for pw
            }  // for ph
        }  // for c
    }  // for n
}
// Runs rotated RoIAlign for one ORT kernel invocation: reads the feature map
// (input 0, NCHW float) and RoIs (input 1, (num_rois, 6) float), shapes the
// output as (num_rois, C, aligned_height_, aligned_width_) and delegates the
// math to ROIAlignRotatedForwardCPU.
void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext* context)
{
    // Setup inputs
    const OrtValue* input_X = ort_.KernelContext_GetInput(context, 0);
    const float*    X_data = ort_.GetTensorData<float>(input_X);
    const OrtValue* input_rois = ort_.KernelContext_GetInput(context, 1);
    // The original fetched this via GetTensorData<const float*> and a
    // reinterpret_cast — wrong template argument; GetTensorData<float>
    // already yields the const float* we want.
    const float* rois = ort_.GetTensorData<float>(input_rois);

    // Setup output: start from the input shape and patch N/H/W.
    OrtTensorDimensions out_dimensions(ort_, input_X);
    OrtTensorDimensions roi_dimensions(ort_, input_rois);
    int input_channels = out_dimensions.data()[1];
    int input_height = out_dimensions.data()[2];
    int input_width = out_dimensions.data()[3];
    out_dimensions.data()[0] = roi_dimensions.data()[0];  // one output slice per RoI
    out_dimensions.data()[2] = aligned_height_;
    out_dimensions.data()[3] = aligned_width_;
    OrtValue* output =
        ort_.KernelContext_GetOutput(context, 0, out_dimensions.data(), out_dimensions.size());
    float* out = ort_.GetTensorMutableData<float>(output);
    // (dead GetTensorTypeAndShape/ReleaseTensorTypeAndShapeInfo pair removed:
    // the info was fetched and released without ever being read)

    // Total number of output elements = product of the output dims.
    int output_size = out_dimensions.data()[0];
    for (auto i = 1; i < out_dimensions.size(); ++i)
    {
        output_size *= out_dimensions.data()[i];
    }
    ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_, aligned_, clockwise_,
                              input_channels, input_height, input_width, aligned_height_,
                              aligned_width_, sampling_ratio_);
}
REGISTER_ONNXRUNTIME_OPS
(
mmdeploy
,
MMCVRoIAlignRotatedCustomOp
);
}
// namespace mmdeploy
csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.h
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#define ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
namespace
mmdeploy
{
// ORT kernel for the "MMCVRoIAlignRotated" custom op. All attributes are
// read once at construction; Compute() (in roi_align_rotated.cpp) performs
// the pooling.
struct MMCVRoIAlignRotatedKernel
{
  public:
    // Reads the op attributes: output_height/width, sampling_ratio,
    // spatial_scale, aligned, clockwise.
    MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
        : ort_(ort)
    {
        aligned_height_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
        aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
        sampling_ratio_ = ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
        spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
        aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
        clockwise_ = ort_.KernelInfoGetAttribute<int64_t>(info, "clockwise");
    }
    void Compute(OrtKernelContext* context);

  private:
    Ort::CustomOpApi ort_;
    int              aligned_height_;   // pooled output height
    int              aligned_width_;    // pooled output width
    float            spatial_scale_;    // RoI coords -> feature-map coords scale
    int              sampling_ratio_;   // samples per bin edge; <=0 means adaptive
    int              aligned_;          // nonzero: apply half-pixel offset
    int              clockwise_;        // nonzero: angles are clockwise
};
// Schema/factory for "MMCVRoIAlignRotated": 2 float inputs (features, rois),
// 1 float output (pooled features), CPU only.
struct MMCVRoIAlignRotatedCustomOp
    : Ort::CustomOpBase<MMCVRoIAlignRotatedCustomOp, MMCVRoIAlignRotatedKernel>
{
    void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const
    {
        return new MMCVRoIAlignRotatedKernel(api, info);  // owned and freed by ORT
    }
    const char* GetName() const { return "MMCVRoIAlignRotated"; }

    size_t                    GetInputTypeCount() const { return 2; }
    ONNXTensorElementDataType GetInputType(size_t) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }

    size_t                    GetOutputTypeCount() const { return 1; }
    ONNXTensorElementDataType GetOutputType(size_t) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }

    // force cpu
    const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
};
}
// namespace mmdeploy
#endif // ONNXRUNTIME_ROI_ALIGN_ROTATED_H
csrc/mmdeploy/backend_ops/tensorrt/CMakeLists.txt
0 → 100644
View file @
546b4279
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_tensorrt_ops)
include(${CMAKE_SOURCE_DIR}/cmake/tensorrt.cmake)

# cub: CUDA < 11 does not bundle cub, so fall back to the vendored copy.
if (NOT DEFINED CUB_ROOT_DIR)
  if (CUDA_VERSION VERSION_LESS 11.0)
    set(CUB_ROOT_DIR "${CMAKE_SOURCE_DIR}/third_party/cub")
  endif ()
endif ()

# Compile every plugin source into an object library shared by consumers.
file(GLOB_RECURSE BACKEND_OPS_SRCS *.cpp *.cu)
add_library(${PROJECT_NAME}_obj OBJECT "${BACKEND_OPS_SRCS}")
set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE 1)
target_compile_definitions(${PROJECT_NAME}_obj
                           PRIVATE -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT=1)
target_include_directories(${PROJECT_NAME}_obj
                           PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common)
target_include_directories(${PROJECT_NAME}_obj
                           PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common)
target_include_directories(${PROJECT_NAME}_obj
                           PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/include)
target_include_directories(${PROJECT_NAME}_obj PRIVATE ${TENSORRT_INCLUDE_DIR})
target_include_directories(${PROJECT_NAME}_obj PRIVATE ${CUDNN_DIR}/include)
target_include_directories(${PROJECT_NAME}_obj PRIVATE ${CUB_ROOT_DIR})
target_link_libraries(${PROJECT_NAME}_obj
                      PUBLIC ${TENSORRT_LIBS} cublas cudnn)
mmdeploy_export(${PROJECT_NAME}_obj)

# Build module library. It is used to convert onnx model to tensorrt engine
mmdeploy_add_module(${PROJECT_NAME} MODULE EXCLUDE "")
target_link_libraries(${PROJECT_NAME} PRIVATE ${PROJECT_NAME}_obj)
add_library(mmdeploy::tensorrt_ops ALIAS ${PROJECT_NAME})

# Install the plugin library under mmdeploy/lib.
set(_TRT_OPS_DIR ${CMAKE_SOURCE_DIR}/mmdeploy/lib)
install(TARGETS ${PROJECT_NAME} DESTINATION ${_TRT_OPS_DIR})
csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp
0 → 100644
View file @
546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include "trt_batched_nms.hpp"
#include <cstring>
#include "nms/batched_nms_kernel.hpp"
#include "nms/kernel.h"
#include "trt_serialize.hpp"
namespace
mmdeploy
{
using
namespace
nvinfer1
;
using
nvinfer1
::
plugin
::
NMSParameters
;
namespace
{
static
const
char
*
NMS_PLUGIN_VERSION
{
"1"
};
static
const
char
*
NMS_PLUGIN_NAME
{
"TRTBatchedNMS"
};
}
// namespace
// Build-time constructor: stores the NMS parameters and whether a third
// output with the kept box indices should be produced.
TRTBatchedNMS::TRTBatchedNMS(const std::string& name, NMSParameters params, bool returnIndex)
    : TRTPluginBase(name), param(params), mReturnIndex(returnIndex)
{
}
// Deserialization constructor: restores fields in the exact order that
// serialize() wrote them (param, mClipBoxes, mReturnIndex).
TRTBatchedNMS::TRTBatchedNMS(const std::string& name, const void* data, size_t length)
    : TRTPluginBase(name)
{
    deserialize_value(&data, &length, &param);
    deserialize_value(&data, &length, &mClipBoxes);
    deserialize_value(&data, &length, &mReturnIndex);
}
// Number of plugin outputs: dets + labels, plus the optional index tensor
// when return_index was requested.
int TRTBatchedNMS::getNbOutputs() const TRT_NOEXCEPT
{
    return mReturnIndex ? 3 : 2;
}
// Shapes: output 0 = (batch, keepTopK, 5) detections, output 1 =
// (batch, keepTopK) labels, output 2 (optional) = (batch, keepTopK) indices.
nvinfer1::DimsExprs TRTBatchedNMS::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
    nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT
{
    ASSERT(nbInputs == 2);
    ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs());
    ASSERT(inputs[0].nbDims == 4);  // boxes: (batch, num_boxes, num_loc_classes, 4)
    ASSERT(inputs[1].nbDims == 3);  // scores: (batch, num_boxes, num_classes)
    nvinfer1::DimsExprs ret;
    ret.d[0] = inputs[0].d[0];
    ret.d[1] = exprBuilder.constant(param.keepTopK);
    switch (outputIndex)
    {
        case 0:
            ret.nbDims = 3;
            ret.d[2] = exprBuilder.constant(5);  // (x1, y1, x2, y2, score)
            break;
        case 1:
            ret.nbDims = 2;
            break;
        case 2:
            ret.nbDims = 2;
            break;  // was missing: silently fell through into default
        default:
            break;
    }
    return ret;
}
// Computes the scratch-space requirement for the NMS kernels from the static
// input shapes; mirrors the shape math performed again in enqueue().
size_t TRTBatchedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
                                       const nvinfer1::PluginTensorDesc* outputs,
                                       int nbOutputs) const TRT_NOEXCEPT
{
    size_t batch_size = inputs[0].dims.d[0];
    // total floats per batch item in the boxes tensor
    size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3];
    // total floats per batch item in the scores tensor
    size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2];
    size_t num_priors = inputs[0].dims.d[1];
    // one shared box per prior (vs one box per class) when dim 2 == 1
    bool shareLocation = (inputs[0].dims.d[2] == 1);
    // clamp topK to the number of available priors
    int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1];
    return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size,
                                           param.numClasses, num_priors, topk, DataType::kFLOAT,
                                           DataType::kFLOAT);
}
// Launches the batched NMS CUDA pipeline. Inputs: boxes
// (batch, num_boxes, num_loc_classes, 4) and scores
// (batch, num_boxes, num_classes). Outputs: dets, labels, and optionally the
// kept indices when return_index was requested.
int TRTBatchedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
                           const nvinfer1::PluginTensorDesc* outputDesc,
                           const void* const* inputs, void* const* outputs, void* workSpace,
                           cudaStream_t stream) TRT_NOEXCEPT
{
    const void* const locData = inputs[0];   // box coordinates
    const void* const confData = inputs[1];  // class scores
    void*             nmsedDets = outputs[0];
    void*             nmsedLabels = outputs[1];
    // index output only exists when the plugin was built with return_index
    void* nmsedIndex = mReturnIndex ? outputs[2] : nullptr;
    size_t batch_size = inputDesc[0].dims.d[0];
    size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3];
    size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2];
    size_t num_priors = inputDesc[0].dims.d[1];
    // one shared box per prior (vs per-class boxes) when dim 2 == 1
    bool shareLocation = (inputDesc[0].dims.d[2] == 1);
    // clamp topK to the number of available priors
    int topk = param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? param.topK : inputDesc[1].dims.d[1];
    bool           rotated = false;  // axis-aligned variant of the shared kernel
    pluginStatus_t status = nmsInference(
        stream, batch_size, boxes_size, score_size, shareLocation, param.backgroundLabelId,
        num_priors, param.numClasses, topk, param.keepTopK, param.scoreThreshold,
        param.iouThreshold, DataType::kFLOAT, locData, DataType::kFLOAT, confData, nmsedDets,
        nmsedLabels, nmsedIndex, workSpace, param.isNormalized, false, mClipBoxes, rotated);
    ASSERT(status == STATUS_SUCCESS);
    return 0;
}
// Serialized size: must match exactly what serialize() writes.
size_t TRTBatchedNMS::getSerializationSize() const TRT_NOEXCEPT
{
    // NMSParameters
    return sizeof(NMSParameters) + sizeof(mClipBoxes) + sizeof(mReturnIndex);
}
// Writes the plugin state; order must match the deserialization constructor.
void TRTBatchedNMS::serialize(void* buffer) const TRT_NOEXCEPT
{
    serialize_value(&buffer, param);
    serialize_value(&buffer, mClipBoxes);
    serialize_value(&buffer, mReturnIndex);
}
// No shape-dependent state to precompute; intentionally empty.
void TRTBatchedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
                                    const nvinfer1::DynamicPluginTensorDesc* outputs,
                                    int nbOutputs) TRT_NOEXCEPT
{
    // Validate input arguments
}
// Accepted formats: every tensor must be linear; I/O slots 3 and 4 (the
// label and index outputs) are int32, all remaining slots are fp32.
bool TRTBatchedNMS::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc,
                                              int nbInputs, int nbOutputs) TRT_NOEXCEPT
{
    const auto& desc = ioDesc[pos];
    if (desc.format != nvinfer1::TensorFormat::kLINEAR)
    {
        return false;
    }
    const bool wants_int32 = (pos == 3) || (pos == 4);
    const auto expected = wants_int32 ? nvinfer1::DataType::kINT32 : nvinfer1::DataType::kFLOAT;
    return desc.type == expected;
}
// Registry type name ("TRTBatchedNMS").
const char* TRTBatchedNMS::getPluginType() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; }
// Registry version string ("1").
const char* TRTBatchedNMS::getPluginVersion() const TRT_NOEXCEPT { return NMS_PLUGIN_VERSION; }
// Deep copy for engine building; caller (TensorRT) owns the returned plugin.
IPluginV2DynamicExt* TRTBatchedNMS::clone() const TRT_NOEXCEPT
{
    auto* plugin = new TRTBatchedNMS(mLayerName, param, mReturnIndex);
    plugin->setPluginNamespace(mNamespace.c_str());
    plugin->setClipParam(mClipBoxes);  // not part of the ctor, copy explicitly
    return plugin;
}
// Output dtypes: labels (1) and indices (2) are int32; detections (0) follow
// the input type.
nvinfer1::DataType TRTBatchedNMS::getOutputDataType(int index,
                                                    const nvinfer1::DataType* inputTypes,
                                                    int nbInputs) const TRT_NOEXCEPT
{
    ASSERT(index >= 0 && index < this->getNbOutputs());
    if (index == 1 || index == 2)
    {
        return nvinfer1::DataType::kINT32;
    }
    return inputTypes[0];
}
// Enables/disables clipping of output boxes to [0, 1].
void TRTBatchedNMS::setClipParam(bool clip) { mClipBoxes = clip; }
// Declares the plugin-field schema consumed by createPlugin(); names must
// match the attributes emitted by the ONNX exporter.
TRTBatchedNMSCreator::TRTBatchedNMSCreator()
{
    mPluginAttributes.emplace_back(PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
    mPluginAttributes.emplace_back(PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
    mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1));
    mPluginAttributes.emplace_back(PluginField("return_index", nullptr, PluginFieldType::kINT32, 1));
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}
// Registry type name; must match TRTBatchedNMS::getPluginType().
const char* TRTBatchedNMSCreator::getPluginName() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; }
// Registry version; must match TRTBatchedNMS::getPluginVersion().
const char* TRTBatchedNMSCreator::getPluginVersion() const TRT_NOEXCEPT { return NMS_PLUGIN_VERSION; }
// Builds a TRTBatchedNMS from the field collection supplied by the ONNX
// parser. Unknown field names are silently ignored; absent fields keep their
// defaults (clip_boxes=true, return_index=false, zero-initialized params).
IPluginV2Ext* TRTBatchedNMSCreator::createPlugin(const char* name,
                                                 const PluginFieldCollection* fc) TRT_NOEXCEPT
{
    const PluginField* fields = fc->fields;
    bool               clipBoxes = true;
    bool               returnIndex = false;
    nvinfer1::plugin::NMSParameters params{};
    for (int i = 0; i < fc->nbFields; ++i)
    {
        const char* attrName = fields[i].name;
        if (!strcmp(attrName, "background_label_id"))
        {
            ASSERT(fields[i].type == PluginFieldType::kINT32);
            params.backgroundLabelId = *(static_cast<const int*>(fields[i].data));
        }
        else if (!strcmp(attrName, "num_classes"))
        {
            ASSERT(fields[i].type == PluginFieldType::kINT32);
            params.numClasses = *(static_cast<const int*>(fields[i].data));
        }
        else if (!strcmp(attrName, "topk"))
        {
            ASSERT(fields[i].type == PluginFieldType::kINT32);
            params.topK = *(static_cast<const int*>(fields[i].data));
        }
        else if (!strcmp(attrName, "keep_topk"))
        {
            ASSERT(fields[i].type == PluginFieldType::kINT32);
            params.keepTopK = *(static_cast<const int*>(fields[i].data));
        }
        else if (!strcmp(attrName, "score_threshold"))
        {
            ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
            params.scoreThreshold = *(static_cast<const float*>(fields[i].data));
        }
        else if (!strcmp(attrName, "iou_threshold"))
        {
            ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
            params.iouThreshold = *(static_cast<const float*>(fields[i].data));
        }
        else if (!strcmp(attrName, "is_normalized"))
        {
            // NOTE(review): these three fields are declared kINT32 in the
            // creator schema but read through a bool* here — reads only the
            // first byte of the int. Works for 0/1 values on little-endian;
            // confirm against the exporter before changing.
            params.isNormalized = *(static_cast<const bool*>(fields[i].data));
        }
        else if (!strcmp(attrName, "clip_boxes"))
        {
            clipBoxes = *(static_cast<const bool*>(fields[i].data));
        }
        else if (!strcmp(attrName, "return_index"))
        {
            returnIndex = *(static_cast<const bool*>(fields[i].data));
        }
    }
    TRTBatchedNMS* plugin = new TRTBatchedNMS(name, params, returnIndex);
    plugin->setClipParam(clipBoxes);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}
// Reconstructs a plugin from its serialized engine bytes.
IPluginV2Ext* TRTBatchedNMSCreator::deserializePlugin(const char* name, const void* serialData,
                                                      size_t serialLength) TRT_NOEXCEPT
{
    // This object will be deleted when the network is destroyed, which will
    // call NMS::destroy()
    TRTBatchedNMS* plugin = new TRTBatchedNMS(name, serialData, serialLength);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}
REGISTER_TENSORRT_PLUGIN
(
TRTBatchedNMSCreator
);
}
// namespace mmdeploy
csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp
0 → 100644
View file @
546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#ifndef TRT_BATCHED_NMS_PLUGIN_CUSTOM_H
#define TRT_BATCHED_NMS_PLUGIN_CUSTOM_H
#include <string>
#include <vector>
#include "NvInferPluginUtils.h"
#include "trt_plugin_base.hpp"
namespace
mmdeploy
{
enum
NMSReturnType
{
RETURN_DETS
=
1
,
RETURN_INDEX
=
1
<<
1
};
// TensorRT dynamic plugin performing batched (axis-aligned) NMS.
// Inputs: boxes (batch, num_boxes, num_loc_classes, 4) and scores
// (batch, num_boxes, num_classes). Outputs: dets, labels, and optionally the
// kept indices when constructed with returnIndex = true.
class TRTBatchedNMS : public TRTPluginBase
{
  public:
    // Build-time constructor.
    TRTBatchedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param, bool returnIndex);

    // Deserialization constructor (engine load path).
    TRTBatchedNMS(const std::string& name, const void* data, size_t length);

    ~TRTBatchedNMS() TRT_NOEXCEPT override = default;

    // 2 outputs, or 3 when the index output is enabled.
    int getNbOutputs() const TRT_NOEXCEPT override;

    nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
                                            int nbInputs,
                                            nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override;

    size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
                            const nvinfer1::PluginTensorDesc* outputs,
                            int nbOutputs) const TRT_NOEXCEPT override;

    // Launches the CUDA NMS pipeline.
    int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
                const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
                void* const* outputs, void* workSpace, cudaStream_t stream) TRT_NOEXCEPT override;

    size_t getSerializationSize() const TRT_NOEXCEPT override;

    void serialize(void* buffer) const TRT_NOEXCEPT override;

    void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
                         const nvinfer1::DynamicPluginTensorDesc* outputs,
                         int nbOutputs) TRT_NOEXCEPT override;

    bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc,
                                   int nbInputs, int nbOutputs) TRT_NOEXCEPT override;

    const char* getPluginType() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;

    nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType,
                                         int nbInputs) const TRT_NOEXCEPT override;

    // Enables/disables clipping of output boxes to [0, 1].
    void setClipParam(bool clip);

  private:
    nvinfer1::plugin::NMSParameters param{};        // thresholds, topK, class counts, ...
    bool                            mClipBoxes{};   // clip boxes to [0, 1] when true
    bool                            mReturnIndex{}; // emit third (index) output when true
};
// Factory registered with TensorRT that creates/deserializes TRTBatchedNMS
// plugins from the "TRTBatchedNMS" plugin name.
class TRTBatchedNMSCreator : public TRTPluginCreatorBase
{
  public:
    // Declares the plugin-field schema used by createPlugin().
    TRTBatchedNMSCreator();

    ~TRTBatchedNMSCreator() TRT_NOEXCEPT override = default;

    const char* getPluginName() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    // Builds a plugin from parsed ONNX attributes.
    nvinfer1::IPluginV2Ext* createPlugin(const char* name,
                                         const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;

    // Restores a plugin from serialized engine bytes.
    nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData,
                                              size_t serialLength) TRT_NOEXCEPT override;
};
}
// namespace mmdeploy
#endif // TRT_BATCHED_NMS_PLUGIN_CUSTOM_H
csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.cpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_batched_rotated_nms.hpp"
#include <cstring>
#include "nms/batched_nms_kernel.hpp"
#include "nms/kernel.h"
#include "trt_serialize.hpp"
namespace
mmdeploy
{
using
namespace
nvinfer1
;
using
nvinfer1
::
plugin
::
NMSParameters
;
namespace
{
static
const
char
*
NMS_PLUGIN_VERSION
{
"1"
};
static
const
char
*
NMS_PLUGIN_NAME
{
"TRTBatchedRotatedNMS"
};
}
// namespace
// Build-time constructor: stores the NMS parameters.
TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, NMSParameters params)
    : TRTPluginBase(name), param(params)
{
}
// Deserialization constructor: restores fields in the order serialize()
// wrote them (param, mClipBoxes).
TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length)
    : TRTPluginBase(name)
{
    deserialize_value(&data, &length, &param);
    deserialize_value(&data, &length, &mClipBoxes);
}
// Two outputs: detections and labels (no optional index output here).
int TRTBatchedRotatedNMS::getNbOutputs() const TRT_NOEXCEPT { return 2; }
// Shapes: output 0 = (batch, keepTopK, 6) rotated detections
// (cx, cy, w, h, angle, score), output 1 = (batch, keepTopK) labels.
nvinfer1::DimsExprs TRTBatchedRotatedNMS::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
    nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT
{
    ASSERT(nbInputs == 2);
    ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs());
    ASSERT(inputs[0].nbDims == 4);  // boxes: (batch, num_boxes, num_loc_classes, 5)
    ASSERT(inputs[1].nbDims == 3);  // scores: (batch, num_boxes, num_classes)
    nvinfer1::DimsExprs ret;
    ret.d[0] = inputs[0].d[0];
    ret.d[1] = exprBuilder.constant(param.keepTopK);
    switch (outputIndex)
    {
        case 0:
            ret.nbDims = 3;
            ret.d[2] = exprBuilder.constant(6);  // 5 box params + score
            break;
        case 1:
            ret.nbDims = 2;
            break;
        default:
            break;
    }
    return ret;
}
// Computes the scratch-space requirement for the rotated NMS kernels from
// the static input shapes; mirrors the shape math performed in enqueue().
size_t TRTBatchedRotatedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                                              int nbInputs,
                                              const nvinfer1::PluginTensorDesc* outputs,
                                              int nbOutputs) const TRT_NOEXCEPT
{
    size_t batch_size = inputs[0].dims.d[0];
    // total floats per batch item in the boxes tensor
    size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3];
    // total floats per batch item in the scores tensor
    size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2];
    size_t num_priors = inputs[0].dims.d[1];
    // one shared box per prior (vs one box per class) when dim 2 == 1
    bool shareLocation = (inputs[0].dims.d[2] == 1);
    // clamp topK to the number of available priors
    int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1];
    return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size,
                                           param.numClasses, num_priors, topk, DataType::kFLOAT,
                                           DataType::kFLOAT);
}
// Launches the rotated batched-NMS CUDA pipeline (rotated = true, no index
// output). Inputs: rotated boxes and per-class scores; outputs: dets, labels.
int TRTBatchedRotatedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
                                  const nvinfer1::PluginTensorDesc* outputDesc,
                                  const void* const* inputs, void* const* outputs,
                                  void* workSpace, cudaStream_t stream) TRT_NOEXCEPT
{
    const void* const locData = inputs[0];   // rotated box parameters
    const void* const confData = inputs[1];  // class scores
    void*             nmsedDets = outputs[0];
    void*             nmsedLabels = outputs[1];
    size_t            batch_size = inputDesc[0].dims.d[0];
    size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3];
    size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2];
    size_t num_priors = inputDesc[0].dims.d[1];
    // one shared box per prior (vs per-class boxes) when dim 2 == 1
    bool shareLocation = (inputDesc[0].dims.d[2] == 1);
    // clamp topK to the number of available priors
    int topk = param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? param.topK : inputDesc[1].dims.d[1];
    bool           rotated = true;  // selects the rotated-IoU kernel path
    pluginStatus_t status = nmsInference(
        stream, batch_size, boxes_size, score_size, shareLocation, param.backgroundLabelId,
        num_priors, param.numClasses, topk, param.keepTopK, param.scoreThreshold,
        param.iouThreshold, DataType::kFLOAT, locData, DataType::kFLOAT, confData, nmsedDets,
        nmsedLabels, nullptr, workSpace, param.isNormalized, false, mClipBoxes, rotated);
    ASSERT(status == STATUS_SUCCESS);
    return 0;
}
// Serialized size: must match exactly what serialize() writes
// (param followed by the bool mClipBoxes).
size_t TRTBatchedRotatedNMS::getSerializationSize() const TRT_NOEXCEPT
{
    // NMSParameters,
    return sizeof(NMSParameters) + sizeof(bool);
}
// Writes the plugin state; order must match the deserialization constructor.
void TRTBatchedRotatedNMS::serialize(void* buffer) const TRT_NOEXCEPT
{
    serialize_value(&buffer, param);
    serialize_value(&buffer, mClipBoxes);
}
// No shape-dependent state to precompute; intentionally empty.
void TRTBatchedRotatedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
                                           int nbInputs,
                                           const nvinfer1::DynamicPluginTensorDesc* outputs,
                                           int nbOutputs) TRT_NOEXCEPT
{
    // Validate input arguments
}
// Accepts only linear-layout tensors: the label output (position 3 in the
// flattened input+output list) must be kINT32, everything else kFLOAT.
bool TRTBatchedRotatedNMS::supportsFormatCombination(int pos,
                                                     const nvinfer1::PluginTensorDesc* ioDesc,
                                                     int nbInputs, int nbOutputs) TRT_NOEXCEPT {
  const auto expected_type =
      (pos == 3) ? nvinfer1::DataType::kINT32 : nvinfer1::DataType::kFLOAT;
  const auto& desc = ioDesc[pos];
  return desc.type == expected_type && desc.format == nvinfer1::TensorFormat::kLINEAR;
}
// Type string used by TensorRT to match this plugin at deserialization time.
const char* TRTBatchedRotatedNMS::getPluginType() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; }

// Version string; must agree with the creator's getPluginVersion().
const char* TRTBatchedRotatedNMS::getPluginVersion() const TRT_NOEXCEPT {
  return NMS_PLUGIN_VERSION;
}
// Deep-copies the plugin: reconstruct from the stored parameters, then
// replicate the runtime-configured state (namespace and clip flag).
IPluginV2DynamicExt* TRTBatchedRotatedNMS::clone() const TRT_NOEXCEPT {
  auto* copy = new TRTBatchedRotatedNMS(mLayerName, param);
  copy->setClipParam(mClipBoxes);
  copy->setPluginNamespace(mNamespace.c_str());
  return copy;
}
// Output 1 carries integer class labels; output 0 follows the input dtype.
nvinfer1::DataType TRTBatchedRotatedNMS::getOutputDataType(
    int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT {
  ASSERT(index >= 0 && index < this->getNbOutputs());
  return index == 1 ? nvinfer1::DataType::kINT32 : inputTypes[0];
}
// Sets whether output boxes are clipped; the flag is forwarded to
// nmsInference() in enqueue() and serialized with the plugin.
void TRTBatchedRotatedNMS::setClipParam(bool clip) { mClipBoxes = clip; }
// Registers the scalar attributes that createPlugin() knows how to parse.
TRTBatchedRotatedNMSCreator::TRTBatchedRotatedNMSCreator() {
  auto declare = [this](const char* field_name, PluginFieldType field_type) {
    mPluginAttributes.emplace_back(PluginField(field_name, nullptr, field_type, 1));
  };
  declare("background_label_id", PluginFieldType::kINT32);
  declare("num_classes", PluginFieldType::kINT32);
  declare("topk", PluginFieldType::kINT32);
  declare("keep_topk", PluginFieldType::kINT32);
  declare("score_threshold", PluginFieldType::kFLOAT32);
  declare("iou_threshold", PluginFieldType::kFLOAT32);
  declare("is_normalized", PluginFieldType::kINT32);
  declare("clip_boxes", PluginFieldType::kINT32);

  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}
// Name reported to the plugin registry; matches the plugin's getPluginType().
const char* TRTBatchedRotatedNMSCreator::getPluginName() const TRT_NOEXCEPT {
  return NMS_PLUGIN_NAME;
}

// Version reported to the plugin registry.
const char* TRTBatchedRotatedNMSCreator::getPluginVersion() const TRT_NOEXCEPT {
  return NMS_PLUGIN_VERSION;
}
// Builds a TRTBatchedRotatedNMS from the field collection supplied by the
// network definition. Unrecognized fields are ignored; missing fields keep
// the zero-initialized NMSParameters defaults (clip_boxes defaults to true).
IPluginV2Ext* TRTBatchedRotatedNMSCreator::createPlugin(const char* name,
                                                        const PluginFieldCollection* fc)
    TRT_NOEXCEPT {
  const PluginField* fields = fc->fields;
  bool clipBoxes = true;
  nvinfer1::plugin::NMSParameters params{};

  for (int i = 0; i < fc->nbFields; ++i) {
    const char* attrName = fields[i].name;
    if (!strcmp(attrName, "background_label_id")) {
      ASSERT(fields[i].type == PluginFieldType::kINT32);
      params.backgroundLabelId = *(static_cast<const int*>(fields[i].data));
    } else if (!strcmp(attrName, "num_classes")) {
      ASSERT(fields[i].type == PluginFieldType::kINT32);
      params.numClasses = *(static_cast<const int*>(fields[i].data));
    } else if (!strcmp(attrName, "topk")) {
      ASSERT(fields[i].type == PluginFieldType::kINT32);
      params.topK = *(static_cast<const int*>(fields[i].data));
    } else if (!strcmp(attrName, "keep_topk")) {
      ASSERT(fields[i].type == PluginFieldType::kINT32);
      params.keepTopK = *(static_cast<const int*>(fields[i].data));
    } else if (!strcmp(attrName, "score_threshold")) {
      ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
      params.scoreThreshold = *(static_cast<const float*>(fields[i].data));
    } else if (!strcmp(attrName, "iou_threshold")) {
      ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
      params.iouThreshold = *(static_cast<const float*>(fields[i].data));
    } else if (!strcmp(attrName, "is_normalized")) {
      // The creator declares this field as kINT32; read it as int. The
      // previous const bool* read inspected only the first byte, which
      // misreads a 32-bit value on big-endian hosts.
      ASSERT(fields[i].type == PluginFieldType::kINT32);
      params.isNormalized = *(static_cast<const int*>(fields[i].data)) != 0;
    } else if (!strcmp(attrName, "clip_boxes")) {
      ASSERT(fields[i].type == PluginFieldType::kINT32);
      clipBoxes = *(static_cast<const int*>(fields[i].data)) != 0;
    }
  }

  TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, params);
  plugin->setClipParam(clipBoxes);
  plugin->setPluginNamespace(mNamespace.c_str());
  return plugin;
}
// Restores a plugin from an engine's serialized payload.
// TensorRT owns the returned object and releases it (via destroy()) when the
// network is destroyed.
IPluginV2Ext* TRTBatchedRotatedNMSCreator::deserializePlugin(const char* name,
                                                             const void* serialData,
                                                             size_t serialLength) TRT_NOEXCEPT {
  auto* restored = new TRTBatchedRotatedNMS(name, serialData, serialLength);
  restored->setPluginNamespace(mNamespace.c_str());
  return restored;
}
REGISTER_TENSORRT_PLUGIN
(
TRTBatchedRotatedNMSCreator
);
}
// namespace mmdeploy
csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.hpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TRT_BATCHED_ROTATED_NMS_HPP
#define TRT_BATCHED_ROTATED_NMS_HPP
#include <string>
#include <vector>
#include "NvInferPluginUtils.h"
#include "trt_plugin_base.hpp"
namespace
mmdeploy
{
// TensorRT dynamic-shape plugin that runs batched NMS over rotated boxes.
// Two inputs (boxes, per-class scores); two outputs (kept detections and
// their int32 class labels — see getOutputDataType()).
class TRTBatchedRotatedNMS : public TRTPluginBase {
 public:
  // Build-time constructor; `param` carries the full NMS configuration.
  TRTBatchedRotatedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param);

  // Deserialization constructor; restores state written by serialize().
  TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length);

  ~TRTBatchedRotatedNMS() TRT_NOEXCEPT override = default;

  int getNbOutputs() const TRT_NOEXCEPT override;

  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
                                          int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
      TRT_NOEXCEPT override;

  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nbOutputs) const TRT_NOEXCEPT override;

  // Launches the NMS kernels on `stream`.
  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
              const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
              void* const* outputs, void* workSpace, cudaStream_t stream) TRT_NOEXCEPT override;

  size_t getSerializationSize() const TRT_NOEXCEPT override;

  void serialize(void* buffer) const TRT_NOEXCEPT override;

  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc* outputs,
                       int nbOutputs) TRT_NOEXCEPT override;

  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs,
                                 int nbOutputs) TRT_NOEXCEPT override;

  const char* getPluginType() const TRT_NOEXCEPT override;

  const char* getPluginVersion() const TRT_NOEXCEPT override;

  nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;

  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType,
                                       int nbInputs) const TRT_NOEXCEPT override;

  // Enables/disables clipping of output boxes; not part of NMSParameters, so
  // it is serialized separately.
  void setClipParam(bool clip);

 private:
  // NMS configuration (thresholds, top-k, class counts, ...).
  nvinfer1::plugin::NMSParameters param{};
  // Whether to clip output boxes (set via setClipParam()).
  bool mClipBoxes{};
};
// Factory registered with TensorRT; builds TRTBatchedRotatedNMS instances
// from plugin fields (createPlugin) or a serialized engine (deserializePlugin).
class TRTBatchedRotatedNMSCreator : public TRTPluginCreatorBase {
 public:
  TRTBatchedRotatedNMSCreator();

  ~TRTBatchedRotatedNMSCreator() TRT_NOEXCEPT override = default;

  const char* getPluginName() const TRT_NOEXCEPT override;

  const char* getPluginVersion() const TRT_NOEXCEPT override;

  nvinfer1::IPluginV2Ext* createPlugin(const char* name,
                                       const nvinfer1::PluginFieldCollection* fc)
      TRT_NOEXCEPT override;

  nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData,
                                            size_t serialLength) TRT_NOEXCEPT override;
};
}
// namespace mmdeploy
#endif
csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.cpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_bicubic_interpolate.hpp"
#include <assert.h>
#include <chrono>
#include "trt_bicubic_interpolate_kernel.hpp"
#include "trt_plugin_helper.hpp"
#include "trt_serialize.hpp"
using
namespace
nvinfer1
;
namespace
mmdeploy
{
namespace
{
static
const
char
*
PLUGIN_VERSION
{
"1"
};
static
const
char
*
PLUGIN_NAME
{
"TRTBicubicInterpolate"
};
}
// namespace
// Build-time constructor.
// scale_factor: {height_scale, width_scale} multipliers applied to H and W
//               (see getOutputDimensions()).
// align_corners: corner-alignment mode forwarded to the CUDA kernel.
TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string& name,
                                             std::vector<float> scale_factor, bool align_corners)
    : TRTPluginBase(name), mScaleFactor(scale_factor), mAlignCorners(align_corners) {}
// Deserialization constructor: restores the fields in the same order
// serialize() wrote them.
TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string name, const void* data,
                                             size_t length)
    : TRTPluginBase(name) {
  deserialize_value(&data, &length, &mScaleFactor);
  deserialize_value(&data, &length, &mAlignCorners);
}
// Deep copy: reconstruct from the stored attributes and propagate the
// plugin namespace.
nvinfer1::IPluginV2DynamicExt* TRTBicubicInterpolate::clone() const TRT_NOEXCEPT {
  auto* copy = new TRTBicubicInterpolate(mLayerName, mScaleFactor, mAlignCorners);
  copy->setPluginNamespace(getPluginNamespace());
  return copy;
}
// Output shape is NCHW with N and C unchanged and H, W multiplied by
// mScaleFactor[0] and mScaleFactor[1] respectively.
// NOTE(review): IExprBuilder::constant() takes an integer, so a fractional
// scale factor is truncated here — confirm callers only pass integral scales.
nvinfer1::DimsExprs TRTBicubicInterpolate::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
    nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
  nvinfer1::DimsExprs ret;
  ret.nbDims = 4;
  ret.d[0] = inputs[0].d[0];  // batch unchanged
  ret.d[1] = inputs[0].d[1];  // channels unchanged
  auto height = exprBuilder.constant(mScaleFactor[0]);
  auto width = exprBuilder.constant(mScaleFactor[1]);
  auto d2 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *height);
  auto d3 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[3], *width);
  ret.d[2] = d2;
  ret.d[3] = d3;
  return ret;
}
// The input (position 0) must be linear fp32; every other tensor must match
// the input's type and layout.
bool TRTBicubicInterpolate::supportsFormatCombination(int pos,
                                                      const nvinfer1::PluginTensorDesc* ioDesc,
                                                      int nbInputs, int nbOutputs) TRT_NOEXCEPT {
  const auto& desc = ioDesc[pos];
  if (pos == 0) {
    return (desc.type == nvinfer1::DataType::kFLOAT &&
            desc.format == nvinfer1::TensorFormat::kLINEAR);
  }
  return desc.type == ioDesc[0].type && desc.format == ioDesc[0].format;
}
// No shape-dependent state to precompute.
void TRTBicubicInterpolate::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
                                            int nbInputs,
                                            const nvinfer1::DynamicPluginTensorDesc* outputs,
                                            int nbOutputs) TRT_NOEXCEPT {}
// The interpolation kernel needs no scratch workspace.
size_t TRTBicubicInterpolate::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                                               int nbInputs,
                                               const nvinfer1::PluginTensorDesc* outputs,
                                               int nbOutputs) const TRT_NOEXCEPT {
  return 0;
}
// Launches the bicubic resize kernel for one execution step.
// Returns 0 on success, 1 for an unsupported input dtype.
int TRTBicubicInterpolate::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
                                   const nvinfer1::PluginTensorDesc* outputDesc,
                                   const void* const* inputs, void* const* outputs,
                                   void* workSpace, cudaStream_t stream) TRT_NOEXCEPT {
  const auto& in_dims = inputDesc[0].dims;
  const auto& out_dims = outputDesc[0].dims;
  int num_batches = in_dims.d[0];
  int num_channels = in_dims.d[1];
  int src_h = in_dims.d[2];
  int src_w = in_dims.d[3];
  int dst_h = out_dims.d[2];
  int dst_w = out_dims.d[3];

  const void* src = inputs[0];
  void* dst = outputs[0];

  // TODO: add fp16 support
  if (inputDesc[0].type == nvinfer1::DataType::kFLOAT) {
    bicubic_interpolate<float>((float*)src, (float*)dst, num_batches, num_channels, src_h, src_w,
                               dst_h, dst_w, mAlignCorners, stream);
    return 0;
  }
  // Unsupported dtype.
  return 1;
}
// The output keeps the input's dtype.
nvinfer1::DataType TRTBicubicInterpolate::getOutputDataType(int index,
                                                            const nvinfer1::DataType* inputTypes,
                                                            int nbInputs) const TRT_NOEXCEPT {
  return inputTypes[0];
}
// IPluginV2 Methods

// Type string used to match this plugin at deserialization time.
const char* TRTBicubicInterpolate::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }

// Version string; must agree with the creator's getPluginVersion().
const char* TRTBicubicInterpolate::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}

// Single output: the resized tensor.
int TRTBicubicInterpolate::getNbOutputs() const TRT_NOEXCEPT { return 1; }
// Byte count of serialize()'s payload: the scale-factor vector plus the
// align-corners flag.
size_t TRTBicubicInterpolate::getSerializationSize() const TRT_NOEXCEPT {
  return serialized_size(mScaleFactor) + serialized_size(mAlignCorners);
}

// Writes the fields in the order the deserialization constructor reads them.
void TRTBicubicInterpolate::serialize(void* buffer) const TRT_NOEXCEPT {
  serialize_value(&buffer, mScaleFactor);
  serialize_value(&buffer, mAlignCorners);
}
////////////////////// creator /////////////////////////////
// Declares the attributes createPlugin() parses from the field collection.
TRTBicubicInterpolateCreator::TRTBicubicInterpolateCreator() {
  mPluginAttributes.clear();
  mPluginAttributes.emplace_back(nvinfer1::PluginField("scale_factor"));
  mPluginAttributes.emplace_back(nvinfer1::PluginField("align_corners"));
  mFC.nbFields = mPluginAttributes.size();
  mFC.fields = mPluginAttributes.data();
}
// Name reported to the plugin registry; matches the plugin's getPluginType().
const char* TRTBicubicInterpolateCreator::getPluginName() const TRT_NOEXCEPT {
  return PLUGIN_NAME;
}

// Version reported to the plugin registry.
const char* TRTBicubicInterpolateCreator::getPluginVersion() const TRT_NOEXCEPT {
  return PLUGIN_VERSION;
}
// Builds a TRTBicubicInterpolate from the supplied field collection.
// "scale_factor" must hold exactly two floats ({height, width}); the field
// length may be reported either in elements or in bytes, so both are
// accepted. "align_corners" is read as an int flag (defaults to true).
nvinfer1::IPluginV2* TRTBicubicInterpolateCreator::createPlugin(
    const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT {
  std::vector<float> scale_factor;
  bool align_corners = 1;

  for (int i = 0; i < fc->nbFields; i++) {
    if (fc->fields[i].data == nullptr) {
      continue;
    }
    std::string field_name(fc->fields[i].name);
    if (field_name.compare("scale_factor") == 0) {
      // Normalize the length: some producers report bytes, others elements.
      int data_size = (fc->fields[i].length);
      if (data_size != 2) {
        data_size = data_size / sizeof(float);
      }
      ASSERT(data_size == 2);
      const float* data_start = static_cast<const float*>(fc->fields[i].data);
      scale_factor = std::vector<float>(data_start, data_start + data_size);
    }
    if (field_name.compare("align_corners") == 0) {
      align_corners = static_cast<const int*>(fc->fields[i].data)[0];
    }
  }

  TRTBicubicInterpolate* plugin = new TRTBicubicInterpolate(name, scale_factor, align_corners);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}
// Restores a plugin from an engine's serialized payload; TensorRT owns the
// returned object.
nvinfer1::IPluginV2* TRTBicubicInterpolateCreator::deserializePlugin(
    const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT {
  auto plugin = new TRTBicubicInterpolate(name, serialData, serialLength);
  plugin->setPluginNamespace(getPluginNamespace());
  return plugin;
}
REGISTER_TENSORRT_PLUGIN
(
TRTBicubicInterpolateCreator
);
}
// namespace mmdeploy
csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.hpp
0 → 100644
View file @
546b4279
#ifndef TRT_BICUBIC_INTERPOLATE_HPP
#define TRT_BICUBIC_INTERPOLATE_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace
mmdeploy
{
// TensorRT dynamic-shape plugin that bicubic-resizes an NCHW float tensor by
// per-axis scale factors.
class TRTBicubicInterpolate : public TRTPluginBase {
 public:
  // Build-time constructor; scale_factor is {height_scale, width_scale}.
  TRTBicubicInterpolate(const std::string& name, std::vector<float> scale_factor,
                        bool align_corners);

  // Deserialization constructor; restores state written by serialize().
  TRTBicubicInterpolate(const std::string name, const void* data, size_t length);

  TRTBicubicInterpolate() = delete;

  // IPluginV2DynamicExt Methods
  nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
                                          int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
      TRT_NOEXCEPT override;
  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs,
                                 int nbOutputs) TRT_NOEXCEPT override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nbOutputs) TRT_NOEXCEPT override;
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nbOutputs) const TRT_NOEXCEPT override;
  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
              const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
              void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;

  // IPluginV2Ext Methods
  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                       int nbInputs) const TRT_NOEXCEPT override;

  // IPluginV2 Methods
  const char* getPluginType() const TRT_NOEXCEPT override;
  const char* getPluginVersion() const TRT_NOEXCEPT override;
  int getNbOutputs() const TRT_NOEXCEPT override;
  size_t getSerializationSize() const TRT_NOEXCEPT override;
  void serialize(void* buffer) const TRT_NOEXCEPT override;

 private:
  // {height_scale, width_scale} multipliers applied to H and W.
  std::vector<float> mScaleFactor;
  // Corner-alignment mode forwarded to the CUDA kernel.
  bool mAlignCorners;
};
// Factory registered with TensorRT; builds TRTBicubicInterpolate instances
// from plugin fields (createPlugin) or a serialized engine (deserializePlugin).
class TRTBicubicInterpolateCreator : public TRTPluginCreatorBase {
 public:
  TRTBicubicInterpolateCreator();

  const char* getPluginName() const TRT_NOEXCEPT override;

  const char* getPluginVersion() const TRT_NOEXCEPT override;

  nvinfer1::IPluginV2* createPlugin(const char* name,
                                    const nvinfer1::PluginFieldCollection* fc)
      TRT_NOEXCEPT override;

  nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData,
                                         size_t serialLength) TRT_NOEXCEPT override;
};
}
// namespace mmdeploy
#endif // TRT_BICUBIC_INTERPOLATE_HPP
csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.cu
0 → 100644
View file @
546b4279
// Modified from
// https://github.com/pytorch/pytorch/blob/6adbe044e39c8e8db158d91e151aa6dead6e9aa4/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu
#include <cuda_fp16.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <vector>
#include "common_cuda_helper.hpp"
#include "trt_bicubic_interpolate_kernel.hpp"
// Based on
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
// Cubic convolution weight for offsets with |x| <= 1:
//   ((A+2)|x| - (A+3)) * x^2 + 1
template <typename scalar_t>
__device__ __forceinline__ static scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
  return ((A + 2) * x - (A + 3)) * x * x + 1;
}
// Cubic convolution weight for offsets with 1 < |x| < 2:
//   ((A*x - 5A) * x + 8A) * x - 4A
template <typename scalar_t>
__device__ __forceinline__ static scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
  return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
}
// Fills coeffs[4] with the 4-tap bicubic weights for a fractional offset t in
// [0, 1], using the fixed convolution parameter A = -0.75.
template <typename scalar_t>
__device__ __forceinline__ static void get_cubic_upsample_coefficients(scalar_t coeffs[4],
                                                                       scalar_t t) {
  scalar_t A = -0.75;

  scalar_t x1 = t;
  coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
  coeffs[1] = cubic_convolution1<scalar_t>(x1, A);

  // opposite coefficients
  scalar_t x2 = 1.0 - t;
  coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
  coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
}
// 1-D cubic interpolation of four consecutive samples x0..x3 at fractional
// position t, measured from x1.
template <typename scalar_t>
__device__ __forceinline__ static scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2,
                                                          scalar_t x3, scalar_t t) {
  scalar_t coeffs[4];
  get_cubic_upsample_coefficients<scalar_t>(coeffs, t);

  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
}
/* Used by UpSampleBicubic2d.cu */
// Reads data[batch][channel][y][x] from an NCHW-layout buffer, clamping y and
// x to the valid range (replicate-border behavior for out-of-bounds taps).
template <typename scalar_t>
__device__ __forceinline__ static scalar_t upsample_get_value_bounded(
    const scalar_t* data, int batch, int channel, int batchsize, int channels, int height,
    int width, int y, int x) {
  int access_y = max(min(y, height - 1), 0);
  int access_x = max(min(x, width - 1), 0);
  return data[batch * channels * height * width + channel * height * width + access_y * width +
              access_x];
}
// Maps a destination pixel index to its (possibly fractional) source
// coordinate for the given scale, honoring align_corners. Negative results
// are clamped to 0 only in non-cubic mode (see the note below).
template <typename scalar_t>
__device__ __forceinline__ scalar_t area_pixel_compute_source_index(scalar_t scale,
                                                                    int64_t dst_index,
                                                                    bool align_corners,
                                                                    bool cubic) {
  if (align_corners) {
    return scale * dst_index;
  } else {
    scalar_t src_idx = scale * (dst_index + 0.5) - 0.5;
    // [Note] Follow Opencv resize logic:
    // We allow negative src_idx here and later will use
    //   dx = src_idx - floorf(src_idx)
    // to compute the "distance"(which affects weights).
    // For linear modes, weight distribution doesn't matter
    // for negative indices as they use 2 pixels to interpolate.
    // For example, [-1, 0], they both use pixel 0 value so it
    // doesn't affect if we bound the src_idx to 0 or not.
    // TODO: Our current linear mode impls use unbound indices
    // where we should and then remove this cubic flag.
    // This matters in cubic mode, as we might need [-1, 0, 1, 2]
    // to interpolate and the weights can be affected.
    return (!cubic && src_idx < 0) ? scalar_t(0) : src_idx;
  }
}
// cubic interpolation pytorch
// Bicubic resize kernel: each thread handles one (output_y, output_x)
// location of the destination image and loops over all batches and channels
// for it. num_elements is dstWidth * dstHeight.
template <typename scalar_t>
__global__ void resize_cubic_kernel_torch(const int num_elements, const scalar_t* src,
                                          const int batchsize, const int channels, int srcWidth,
                                          int srcHeight, scalar_t* dst, int dstWidth,
                                          int dstHeight, bool align_corners, float height_scale,
                                          float width_scale) {
  CUDA_1D_KERNEL_LOOP(index, num_elements) {
    const int output_x = index % dstWidth;
    const int output_y = index / dstWidth;
    // Special case: input and output are the same size, just copy
    if (srcHeight == dstHeight && srcWidth == dstWidth) {
      for (int n = 0; n < batchsize; n++) {
        for (int c = 0; c < channels; c++) {
          const scalar_t val = src[n * channels * dstHeight * dstWidth +
                                   c * dstHeight * dstWidth + output_y * dstWidth + output_x];
          dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth +
              output_y * dstWidth + output_x] = val;
        }
      }
      return;
    }
    // Interpolation kernel
    // Compute the fractional source coordinates and their integer/fractional
    // parts; t_x/t_y are the 4-tap weights' offsets.
    scalar_t real_x = area_pixel_compute_source_index(width_scale, output_x, align_corners,
                                                      /*cubic=*/true);
    int in_x = floorf(real_x);
    scalar_t t_x = real_x - in_x;

    scalar_t real_y = area_pixel_compute_source_index(height_scale, output_y, align_corners,
                                                      /*cubic=*/true);
    int in_y = floorf(real_y);
    scalar_t t_y = real_y - in_y;

    for (int n = 0; n < batchsize; n++) {
      for (int c = 0; c < channels; c++) {
        // Separable bicubic: interpolate four rows horizontally, then blend
        // the four row results vertically.
        scalar_t coefficients[4];
        for (int k = 0; k < 4; k++) {
          coefficients[k] = cubic_interp1d<scalar_t>(
              upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
                                         in_y - 1 + k, in_x - 1),
              upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
                                         in_y - 1 + k, in_x + 0),
              upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
                                         in_y - 1 + k, in_x + 1),
              upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
                                         in_y - 1 + k, in_x + 2),
              t_x);
        }
        dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth +
            output_y * dstWidth + output_x] =
            scalar_t(cubic_interp1d(coefficients[0], coefficients[1], coefficients[2],
                                    coefficients[3], t_y));
      }
    }
  }
}
// Computes the src/dst scales and launches the bicubic resize kernel on
// `stream`. With align_corners the scales become (src-1)/(dst-1) so corner
// pixels map exactly onto each other.
// NOTE(review): the grid is sized from n = batch*channels*dstW*dstH, but the
// kernel only iterates dstWidth*dstHeight indices (each covering every batch
// and channel), so the surplus blocks exit immediately — confirm intended.
template <typename scalar_t>
void resizeGPU(const scalar_t* pIn_d, scalar_t* pOut_d, int batch, int channels, int srcWidth,
               int srcHeight, int dstWidth, int dstHeight, bool align_corners,
               cudaStream_t stream) {
  float height_scale = float(srcHeight) / dstHeight;
  float width_scale = float(srcWidth) / dstWidth;
  if (align_corners && dstWidth > 1 && dstHeight > 1) {
    height_scale = (float)(srcHeight - 1) / (dstHeight - 1);
    width_scale = (float)(srcWidth - 1) / (dstWidth - 1);
  }
  int n = batch * dstWidth * dstHeight * channels;
  resize_cubic_kernel_torch<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(
      dstWidth * dstHeight, pIn_d, batch, channels, srcWidth, srcHeight, pOut_d, dstWidth,
      dstHeight, align_corners, height_scale, width_scale);
}
// Public entry point: bicubic-resizes an NCHW tensor on `stream`. Note the
// argument order flips from (height, width) here to (width, height) in
// resizeGPU().
template <typename scalar_t>
void bicubic_interpolate(const scalar_t* input, scalar_t* output, int batch, int channels,
                         int in_height, int in_width, int out_height, int out_width,
                         bool align_corners, cudaStream_t stream) {
  resizeGPU(input, output, batch, channels, in_width, in_height, out_width, out_height,
            align_corners, stream);
}
template
void
bicubic_interpolate
<
float
>(
const
float
*
input
,
float
*
output
,
int
batch
,
int
channels
,
int
in_height
,
int
in_width
,
int
out_height
,
int
out_width
,
bool
align_corners
,
cudaStream_t
stream
);
csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.hpp
0 → 100644
View file @
546b4279
#ifndef TRT_BICUBIC_INTERPOLATE_KERNEL_HPP
#define TRT_BICUBIC_INTERPOLATE_KERNEL_HPP
#include <cuda_runtime.h>
#include "common_cuda_helper.hpp"
template
<
typename
scalar_t
>
void
bicubic_interpolate
(
const
scalar_t
*
input
,
scalar_t
*
output
,
int
batch
,
int
channels
,
int
in_height
,
int
in_width
,
int
out_height
,
int
out_width
,
bool
align_corners
,
cudaStream_t
stream
);
#endif // TRT_BICUBIC_INTERPOLATE_KERNEL_HPP
csrc/mmdeploy/backend_ops/tensorrt/common/common_cuda_helper.hpp
0 → 100644
View file @
546b4279
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef COMMON_CUDA_HELPER
#define COMMON_CUDA_HELPER
#include <cublas_v2.h>
#include <cuda.h>
#include <stdio.h>
#include <algorithm>
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define THREADS_PER_BLOCK 512
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// Computes the CUDA grid size for N work items, capped at 4096 blocks; the
// grid-cap relies on kernels written with CUDA_1D_KERNEL_LOOP to cover the
// remainder.
inline int GET_BLOCKS(const int N) {
  const int blocks_needed = DIVUP(N, THREADS_PER_BLOCK);
  const int block_cap = 4096;
  return std::min(blocks_needed, block_cap);
}
#define cudaCheckError() \
{ \
cudaError_t e = cudaGetLastError(); \
if (e != cudaSuccess) { \
printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
exit(0); \
} \
}
/**
* Returns a view of the original tensor with its dimensions permuted.
*
* @param[out] dst pointer to the destination tensor
* @param[in] src pointer to the source tensor
* @param[in] src_size shape of the src tensor
* @param[in] permute The desired ordering of dimensions
* @param[in] src_dim dim of src tensor
* @param[in] stream cuda stream handle
*/
template
<
class
scalar_t
>
void
memcpyPermute
(
scalar_t
*
dst
,
const
scalar_t
*
src
,
int
*
src_size
,
int
*
permute
,
int
src_dim
,
cudaStream_t
stream
=
0
);
template
<
typename
scalar_t
>
cublasStatus_t
cublasGemmWrap
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
scalar_t
*
alpha
,
const
scalar_t
*
A
,
int
lda
,
const
scalar_t
*
B
,
int
ldb
,
const
scalar_t
*
beta
,
scalar_t
*
C
,
int
ldc
);
// Bilinearly samples input (a single H x W plane) at fractional coordinates
// (y, x). Returns 0 for points more than one pixel outside the feature map;
// otherwise clamps into range and blends the four neighboring samples.
template <typename scalar_t>
__device__ __forceinline__ scalar_t bilinear_interpolate(const scalar_t* __restrict__ input,
                                                         const int height, const int width,
                                                         scalar_t y, scalar_t x) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
  // Clamp so the four taps below stay in bounds.
  y = min(scalar_t(height - 1), max(scalar_t(0), y));
  x = min(scalar_t(width - 1), max(scalar_t(0), x));

  const int y_low = floor(y);
  const int x_low = floor(x);
  const int y_high = ceil(y);
  const int x_high = ceil(x);

  // The four neighboring samples.
  const scalar_t v1 = input[y_low * width + x_low];
  const scalar_t v2 = input[y_low * width + x_high];
  const scalar_t v3 = input[y_high * width + x_low];
  const scalar_t v4 = input[y_high * width + x_high];

  // lerp can be performed by fma
  const scalar_t ly = y - y_low;
  const scalar_t lx = x - x_low;
  const scalar_t v_low = fma(v2 - v1, lx, v1);
  const scalar_t v_high = fma(v4 - v3, lx, v3);
  const scalar_t val = fma(v_high - v_low, ly, v_low);

  return val;
}
#endif // COMMON_CUDA_HELPER
csrc/mmdeploy/backend_ops/tensorrt/common/nms/batched_nms_kernel.hpp
0 → 100644
View file @
546b4279
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#ifndef TRT_BATCHED_NMS_KERNEL_HPP
#define TRT_BATCHED_NMS_KERNEL_HPP
#include "cuda_runtime_api.h"
#include "kernel.h"
pluginStatus_t
nmsInference
(
cudaStream_t
stream
,
const
int
N
,
const
int
perBatchBoxesSize
,
const
int
perBatchScoresSize
,
const
bool
shareLocation
,
const
int
backgroundLabelId
,
const
int
numPredsPerClass
,
const
int
numClasses
,
const
int
topK
,
const
int
keepTopK
,
const
float
scoreThreshold
,
const
float
iouThreshold
,
const
DataType
DT_BBOX
,
const
void
*
locData
,
const
DataType
DT_SCORE
,
const
void
*
confData
,
void
*
nmsedDets
,
void
*
nmsedLabels
,
void
*
nmsedIndex
,
void
*
workspace
,
bool
isNormalized
,
bool
confSigmoid
,
bool
clipBoxes
,
bool
rotated
=
false
);
#endif
Prev
1
…
6
7
8
9
10
11
12
13
14
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment