gaoqiong / onnxruntime_v14 · Commits · 1a91fcc2
"git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "0d5aa0f1a06b70276a5d351280623fb2535f4f3c"
Commit 1a91fcc2, authored Jul 25, 2023 by gaoqiong
Commit message: add files required by dtk
Parent: a144865d
Pipeline #492: failed in 0 seconds
Changes: 280 · Pipelines: 1
Showing 20 changed files with 2152 additions and 0 deletions (+2152, -0).
Changed files (all under build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/):
  nn/max_pool_with_index.cu                      +187  -0
  nn/max_pool_with_index.h                        +24  -0
  nn/pool.cc                                     +286  -0
  nn/pool.h                                       +30  -0
  nn/shrink.cc                                    +54  -0
  nn/shrink.h                                     +34  -0
  nn/shrink_impl.cu                               +82  -0
  nn/shrink_impl.h                                +19  -0
  nvtx_profile.cc                                 +65  -0
  nvtx_profile.h                                 +141  -0
  nvtx_profile_context.h                          +53  -0
  object_detection/non_max_suppression.cc        +138  -0
  object_detection/non_max_suppression.h          +23  -0
  object_detection/non_max_suppression_impl.cu   +446  -0
  object_detection/non_max_suppression_impl.h     +27  -0
  object_detection/roialign.cc                    +78  -0
  object_detection/roialign.h                     +23  -0
  object_detection/roialign_impl.cu              +236  -0
  object_detection/roialign_impl.h                +31  -0
  reduction/reduction_functions.cc               +175  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/max_pool_with_index.cu (new file, mode 100644)
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "max_pool_with_index.h"
#include <cfloat>
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/shared_inc/fast_divmod.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
__global__ void MaxPoolWithIndexKernel(
    int64_t batch, int64_t channels, int64_t height, int64_t width, int64_t depth,
    int64_t pooled_height, int64_t pooled_width, int64_t pooled_depth,
    int64_t kernel_h, int64_t kernel_w, int64_t kernel_d,
    int64_t stride_h, int64_t stride_w, int64_t stride_d,
    int64_t pad_h, int64_t pad_w, int64_t pad_d,
    int64_t dilation_h, int64_t dilation_w, int64_t dilation_d,
    fast_divmod fdm_c, fast_divmod fdm_h, fast_divmod fdm_w, fast_divmod fdm_d,
    int64_t storage_order, const T* p_input, int64_t output_size,
    T* p_output, int64_t* p_indices) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id >= output_size) return;

  int d_index, w_index, h_index, c_index, n_index, id_tmp;
  fdm_d.divmod(id, id_tmp, d_index);
  fdm_w.divmod(id_tmp, id_tmp, w_index);
  fdm_h.divmod(id_tmp, id_tmp, h_index);
  fdm_c.divmod(id_tmp, n_index, c_index);

  int64_t d_start = d_index * stride_d - pad_d;
  int64_t w_start = w_index * stride_w - pad_w;
  int64_t h_start = h_index * stride_h - pad_h;
  int64_t d_end = _Min<int64_t>(d_start + (kernel_d - 1) * dilation_d + 1, depth);
  int64_t w_end = _Min<int64_t>(w_start + (kernel_w - 1) * dilation_w + 1, width);
  int64_t h_end = _Min<int64_t>(h_start + (kernel_h - 1) * dilation_h + 1, height);
  d_start = _Max<int64_t>(d_start, 0);
  w_start = _Max<int64_t>(w_start, 0);
  h_start = _Max<int64_t>(h_start, 0);
  int64_t d_index_max = -1;
  int64_t w_index_max = -1;
  int64_t h_index_max = -1;
  int64_t offset = (n_index * channels + c_index) * height * width * depth;
  const T* p_slice = p_input + offset;
  T maxval = p_slice[h_start * width * depth + w_start * depth + d_start] - (T)1;
  for (int64_t d = d_start; d < d_end; d += dilation_d) {
    for (int64_t w = w_start; w < w_end; w += dilation_w) {
      for (int64_t h = h_start; h < h_end; h += dilation_h) {
        if (p_slice[h * width * depth + w * depth + d] > maxval) {
          h_index_max = h;
          w_index_max = w;
          d_index_max = d;
          maxval = static_cast<float>(p_slice[h * width * depth + w * depth + d]);
        }
      }
    }
  }
  p_output[id] = p_input[offset + h_index_max * width * depth + w_index_max * depth + d_index_max];
  if (p_indices) {
    p_indices[id] = storage_order == 0
                        ? offset + h_index_max * width * depth + w_index_max * depth + d_index_max
                        : offset + h_index_max + w_index_max * height + d_index_max * width * height;
  }
}

template <typename T>
void MaxPoolWithIndex(
    hipStream_t stream,
    const TensorShape& input_shape,
    const TensorShape& output_shape,
    const gsl::span<const int64_t>& kernel_shape,
    const gsl::span<const int64_t>& stride_shape,
    const gsl::span<const int64_t>& pads,
    const gsl::span<const int64_t>& dilations,
    int64_t storage_order,
    const T* p_input,
    T* p_output,
    int64_t* p_indices) {
  int64_t batchs = input_shape[0];
  int64_t channels = input_shape[1];
  int64_t height = input_shape[2];
  int64_t width = kernel_shape.size() > 1 ? input_shape[3] : 1;
  int64_t depth = kernel_shape.size() > 2 ? input_shape[4] : 1;
  int64_t pooled_height = output_shape[2];
  int64_t pooled_width = kernel_shape.size() > 1 ? output_shape[3] : 1;
  int64_t pooled_depth = kernel_shape.size() > 2 ? output_shape[4] : 1;
  int64_t kernel_h = kernel_shape[0];
  int64_t kernel_w = kernel_shape.size() > 1 ? kernel_shape[1] : 1;
  int64_t kernel_d = kernel_shape.size() > 2 ? kernel_shape[2] : 1;
  int64_t stride_h = stride_shape[0];
  int64_t stride_w = stride_shape.size() > 1 ? stride_shape[1] : 1;
  int64_t stride_d = stride_shape.size() > 2 ? stride_shape[2] : 1;
  // pads in the format of [x1_begin, x2_begin...x1_end, x2_end,...],
  // where xi_begin is the number of pixels added at the beginning of axis i
  // and xi_end the number of pixels added at the end of axis i.
  int64_t pad_h = pads[0];
  int64_t pad_w = pads.size() >= 4 ? pads[1] : 0;
  int64_t pad_d = pads.size() == 6 ? pads[2] : 0;
  int64_t dilation_h = dilations[0];
  int64_t dilation_w = dilations.size() >= 2 ? dilations[1] : 1;
  int64_t dilation_d = dilations.size() == 3 ? dilations[2] : 1;
  int64_t output_size = output_shape.Size();

  fast_divmod fdm_c(static_cast<int>(channels));
  fast_divmod fdm_h(static_cast<int>(pooled_height));
  fast_divmod fdm_w(static_cast<int>(pooled_width));
  fast_divmod fdm_d(static_cast<int>(pooled_depth));

  int blocksPerGrid = (int)((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
  hipLaunchKernelGGL(MaxPoolWithIndexKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     batchs, channels, height, width, depth,
                     pooled_height, pooled_width, pooled_depth,
                     kernel_h, kernel_w, kernel_d,
                     stride_h, stride_w, stride_d,
                     pad_h, pad_w, pad_d,
                     dilation_h, dilation_w, dilation_d,
                     fdm_c, fdm_h, fdm_w, fdm_d,
                     storage_order, p_input, output_size, p_output, p_indices);
}
#define INSTANTIATEMAXPOOLWITHINDEX(T) \
template void MaxPoolWithIndex<T>( \
hipStream_t stream, \
const TensorShape& input_shape, \
const TensorShape& output_shape, \
const gsl::span<const int64_t>& kernel_shape, \
const gsl::span<const int64_t>& stride_shape, \
const gsl::span<const int64_t>& pads, \
const gsl::span<const int64_t>& dilations, \
int64_t storage_order, \
const T* p_input, \
T* p_output, \
int64_t* p_indices);
INSTANTIATEMAXPOOLWITHINDEX(float)
INSTANTIATEMAXPOOLWITHINDEX(double)
INSTANTIATEMAXPOOLWITHINDEX(half)
INSTANTIATEMAXPOOLWITHINDEX(int8_t)
INSTANTIATEMAXPOOLWITHINDEX(uint8_t)

}  // namespace rocm
}  // namespace onnxruntime
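For reference, a minimal host-side sketch of the index arithmetic the kernel uses for the optional Indices output: storage_order == 0 emits row-major flat offsets, storage_order == 1 emits column-major ones. The helper name flat_index is hypothetical and only illustrates the formula used above.

#include <cstdint>

// Sketch only: mirrors the p_indices computation in MaxPoolWithIndexKernel.
int64_t flat_index(int64_t offset, int64_t h, int64_t w, int64_t d,
                   int64_t height, int64_t width, int64_t depth, int64_t storage_order) {
  return storage_order == 0
             ? offset + h * width * depth + w * depth + d     // row-major
             : offset + h + w * height + d * width * height;  // column-major
}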
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/max_pool_with_index.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <vector>
#include "core/framework/tensor_shape.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void MaxPoolWithIndex(
    hipStream_t stream,
    const TensorShape& input_shape,
    const TensorShape& output_shape,
    const gsl::span<const int64_t>& kernel_shape,
    const gsl::span<const int64_t>& stride_shape,
    const gsl::span<const int64_t>& pads,
    const gsl::span<const int64_t>& dilations,
    int64_t storage_order,
    const T* p_input,
    T* p_output,
    int64_t* p_indices);
}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/pool.cc (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/nn/pool.h"
#include "core/providers/rocm/miopen_common.h"
#include "core/providers/rocm/nn/max_pool_with_index.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
using namespace onnxruntime::common;

namespace onnxruntime {
namespace rocm {
#define POOLING_KERNEL(op_name, data_type, pool_type, since_version) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()), \
Pool<data_type, pool_type>);
#define POOLING_KERNEL_VERSIONED(op_name, data_type, pool_type, since_version, end_version) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
end_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()), \
Pool<data_type, pool_type>);
#define POOLING_KERNEL_WITH_INDICES(op_name, data_type, pool_type, since_version) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()) \
.TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()), \
Pool<data_type, pool_type>);
#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, data_type, pool_type, since_version, end_version) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
end_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()) \
.TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()), \
Pool<data_type, pool_type>);
POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9)
POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9)
POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9)
POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10)
POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 10, 10)
POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 10, 10)
// AveragePool and MaxPool op set 11 only update spec document on default value for dilations and strides.
POOLING_KERNEL(AveragePool, float, AveragePool, 11)
POOLING_KERNEL(AveragePool, double, AveragePool, 11)
POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11)
POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1)
POOLING_KERNEL(GlobalAveragePool, double, AveragePool, 1)
POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1)
POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<1>, 1, 7)
POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<1>, 1, 7)
POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 8, 9)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 8, 9)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 8, 9)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 10, 10)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 10, 10)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 10, 10)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 11, 11)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 11, 11)
POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, double, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, int8_t, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, uint8_t, MaxPool<8>, 12)
POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1)
POOLING_KERNEL(GlobalMaxPool, double, MaxPool<1>, 1)
POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1)

class MiopenPoolingDescriptor final {
 public:
  MiopenPoolingDescriptor() : desc_(nullptr) {
  }

  ~MiopenPoolingDescriptor() {
    if (desc_ != nullptr) {
      miopenDestroyPoolingDescriptor(desc_);
      desc_ = nullptr;
    }
  }

  MiopenPoolingDescriptor(const MiopenPoolingDescriptor&) = delete;
  MiopenPoolingDescriptor& operator=(const MiopenPoolingDescriptor&) = delete;

  Status Set(miopenPoolingMode_t mode,
             const gsl::span<const int64_t>& kernel_shape,
             const gsl::span<const int64_t>& pads,
             const gsl::span<const int64_t>& strides) {
    if (!desc_)
      MIOPEN_RETURN_IF_ERROR(miopenCreatePoolingDescriptor(&desc_));

    int rank = gsl::narrow_cast<int>(kernel_shape.size());
    InlinedVector<int> window(rank);
    InlinedVector<int> padding(rank);
    InlinedVector<int> stride(rank);
    for (int i = 0; i < rank; i++) {
      window[i] = gsl::narrow_cast<int>(kernel_shape[i]);
    }
    for (int i = 0; i < rank; i++) {
      padding[i] = gsl::narrow_cast<int>(pads[i]);
    }
    for (int i = 0; i < rank; i++) {
      stride[i] = gsl::narrow_cast<int>(strides[i]);
    }
    MIOPEN_RETURN_IF_ERROR(SetPoolingNdDescriptorHelper(
        desc_, mode, MIOPEN_PROPAGATE_NAN, rank, window.data(), padding.data(), stride.data()));

    return Status::OK();
  }

  operator miopenPoolingDescriptor_t() const { return desc_; }

 private:
  miopenPoolingDescriptor_t desc_;
};

template <typename T, typename PoolType>
Status Pool<T, PoolType>::ComputeInternal(OpKernelContext* context) const {
  typedef typename ToHipType<T>::MappedType HipT;
  const Tensor* X = context->Input<Tensor>(0);
  const TensorShape& x_shape = X->Shape();
  const auto x_dims = x_shape.GetDims();

  if (x_shape.NumDimensions() < 3) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Input dimension cannot be less than 3.");
  }

  auto kernel_shape = pool_attrs_.kernel_shape;
  auto pads = pool_attrs_.pads;
  auto strides = pool_attrs_.strides;

  if (pool_attrs_.global_pooling) {
    kernel_shape.assign(x_dims.begin() + 2, x_dims.end());
    pads.assign(kernel_shape.size(), 0);
    strides.assign(kernel_shape.size(), 1);
  }
  auto y_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
  TensorShape y_shape(y_dims);
  Tensor* Y = context->Output(0, y_shape);
  // special case when there is a dim value of 0 in the shape.
  if (y_shape.Size() == 0)
    return Status::OK();

  auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
  auto y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());

  TensorShapeVector x_dims_miopen(x_dims.begin(), x_dims.end());
  TensorShapeVector y_dims_miopen(y_dims);
  if (kernel_shape.size() < 2) {
    // miopen only takes 4D or 5D input, so pad dimensions if needed
    x_dims_miopen.push_back(1);
    y_dims_miopen.push_back(1);
    pads.insert(pads.begin() + kernel_shape.size(), 0);
    pads.insert(pads.end(), 0);
    kernel_shape.push_back(1);
    strides.push_back(1);
  }

  miopenPoolingMode_t mode = miopenPoolingMax;
  if constexpr (PoolType::type == onnxruntime::PoolType::kAveragePool) {
    mode = pool_attrs_.count_include_pad ? miopenPoolingAverageInclusive : miopenPoolingAverage;
  }
  MiopenPoolingDescriptor pooling_desc;
  ORT_RETURN_IF_ERROR(pooling_desc.Set(mode, kernel_shape, pads, strides));

  if constexpr (std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value) {
    // Cast to float back and forth using temp buffer
    const auto alpha = Consts<float>::One;
    const auto beta = Consts<float>::Zero;
    MiopenTensor x_tensor;
    MiopenTensor y_tensor;
    ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_miopen, MiopenTensor::GetDataType<float>()));
    ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_miopen, MiopenTensor::GetDataType<float>()));

    const auto input_count = x_shape.Size();
    const auto output_count = y_shape.Size();

    IAllocatorUniquePtr<float> temp_X = GetScratchBuffer<float>(input_count);
    auto temp_Y = GetScratchBuffer<float>(output_count);
    Impl_Cast<HipT, float>(Stream(), reinterpret_cast<const HipT*>(x_data), temp_X.get(), input_count);
    MIOPEN_RETURN_IF_ERROR(PoolingForwardHelper(MiopenHandle(), pooling_desc, &alpha, x_tensor, temp_X.get(),
                                                &beta, y_tensor, temp_Y.get()));
    Impl_Cast<float, HipT>(Stream(), temp_Y.get(), y_data, output_count);
  } else {
    const auto alpha = Consts<HipT>::One;
    const auto beta = Consts<HipT>::Zero;
    MiopenTensor x_tensor;
    MiopenTensor y_tensor;
    ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_miopen, MiopenTensor::GetDataType<HipT>()));
    ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_miopen, MiopenTensor::GetDataType<HipT>()));

    MIOPEN_RETURN_IF_ERROR(PoolingForwardHelper(MiopenHandle(), pooling_desc, &alpha, x_tensor, x_data,
                                                &beta, y_tensor, y_data));
  }

  return Status::OK();
}

template <typename T>
Status Pool<T, MaxPool<8>>::ComputeInternal(OpKernelContext* context) const {
  typedef typename ToHipType<T>::MappedType HipT;
  const Tensor* X = context->Input<Tensor>(0);
  const TensorShape& x_shape = X->Shape();
  const auto& x_dims = x_shape.GetDims();

  if (x_shape.NumDimensions() < 3) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Input dimension cannot be less than 3.");
  }

  auto kernel_shape = this->pool_attrs_.kernel_shape;
  auto pads = this->pool_attrs_.pads;
  auto strides = this->pool_attrs_.strides;

  if (this->pool_attrs_.global_pooling) {
    kernel_shape.assign(x_dims.begin() + 2, x_dims.end());
    pads.assign(kernel_shape.size(), 0);
    strides.assign(kernel_shape.size(), 1);
  }
  auto y_dims = this->pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
  Tensor* Y = context->Output(0, TensorShape(y_dims));

  // special case when there is a dim value of 0 in the shape.
  if (Y->Shape().Size() == 0)
    return Status::OK();

  auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
  auto y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());

  Tensor* I = context->Output(1, TensorShape(y_dims));
  if (nullptr != I || !this->pool_attrs_.default_dilations) {
    auto i_data = nullptr == I ? nullptr : I->MutableData<int64_t>();
    MaxPoolWithIndex<HipT>(
        this->Stream(),
        x_shape,
        TensorShape(y_dims),
        kernel_shape,
        strides,
        pads,
        this->pool_attrs_.dilations,
        this->pool_attrs_.storage_order,
        x_data,
        y_data,
        i_data);
  } else {
    ORT_RETURN_IF_ERROR((Pool<T, MaxPool<1>>::ComputeInternal(context)));
  }
  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
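A condensed sketch of the dispatch decision made by the MaxPool<8> specialization above: the hand-written MaxPoolWithIndex kernel is used only when the Indices output is requested or dilations are non-default, otherwise the MIOpen pooling path handles the op. The helper name need_custom_kernel is hypothetical.

// Sketch only: summarizes the branch in Pool<T, MaxPool<8>>::ComputeInternal.
bool need_custom_kernel(bool indices_output_requested, bool default_dilations) {
  // Indices and dilations are not covered by the MIOpen pooling call,
  // so either condition routes to MaxPoolWithIndex.
  return indices_output_requested || !default_dilations;
}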
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/pool.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"
#include "core/providers/cpu/nn/pool_base.h"
namespace onnxruntime {
namespace rocm {

template <typename T, typename PoolType>
class Pool : public RocmKernel, public PoolBase {
 public:
  Pool(const OpKernelInfo& info) : RocmKernel(info), PoolBase(info) {}

  Status ComputeInternal(OpKernelContext* context) const override;
};

template <typename T>
class Pool<T, MaxPool<8>> final : public Pool<T, MaxPool<1>> {
 public:
  Pool(const OpKernelInfo& info) : Pool<T, MaxPool<1>>(info) {}

  Status ComputeInternal(OpKernelContext* context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/shrink.cc (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "shrink.h"
#include "shrink_impl.h"
#include "core/providers/common.h"
using namespace std;
namespace onnxruntime {
namespace rocm {
#define SHRINK_REGISTER_KERNEL(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Shrink, \
kOnnxDomain, \
9, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.MayInplace(0, 0) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Shrink<T>);
template <typename T>
Status Shrink<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
  typedef typename ToHipType<T>::MappedType HipT;

  const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
  const auto* x_data = reinterpret_cast<const HipT*>(X->Data<T>());
  const TensorShape& x_shape = X->Shape();
  const size_t x_size = static_cast<size_t>(x_shape.Size());

  Tensor* Y = p_op_kernel_context->Output(0, x_shape);
  auto* y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());

  ShrinkImpl<HipT>(Stream(), x_data, bias_, lambd_, y_data, x_size);

  return Status::OK();
}

SHRINK_REGISTER_KERNEL(float)
SHRINK_REGISTER_KERNEL(double)
SHRINK_REGISTER_KERNEL(MLFloat16)
SHRINK_REGISTER_KERNEL(uint8_t)
SHRINK_REGISTER_KERNEL(int8_t)
SHRINK_REGISTER_KERNEL(uint16_t)
SHRINK_REGISTER_KERNEL(int16_t)
SHRINK_REGISTER_KERNEL(uint32_t)
SHRINK_REGISTER_KERNEL(int32_t)
SHRINK_REGISTER_KERNEL(uint64_t)
SHRINK_REGISTER_KERNEL(int64_t)

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/shrink.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
class Shrink final : public RocmKernel {
 public:
  Shrink(const OpKernelInfo& info) : RocmKernel(info) {
    float bias_temp;
    // if the attribute exists, use the value
    if (info.GetAttr<float>("bias", &bias_temp).IsOK())
      bias_ = bias_temp;

    float lambd_temp;
    // if the attribute exists, use the value
    if (info.GetAttr<float>("lambd", &lambd_temp).IsOK())
      lambd_ = lambd_temp;
  }

  Status ComputeInternal(OpKernelContext* p_op_kernel_context) const;

 private:
  float bias_ = 0.0f;   // default as per spec
  float lambd_ = 0.5f;  // default as per spec
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/shrink_impl.cu (new file, mode 100644)
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "shrink_impl.h"
namespace onnxruntime {
namespace rocm {

// Generic implementation of Shrink
template <typename T>
__global__ void _ShrinkKernel(const T* input_data, const float bias, const float lambda,
                              T* output_data, const HIP_LONG N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  T x = input_data[id];
  if (x < -lambda) {
    output_data[id] = (T)(x + bias);
  } else if (x > lambda) {
    output_data[id] = (T)(x - bias);
  } else {
    output_data[id] = (T)0;
  }
}

// Specialized implementation for 'half' type
// the idea is to convert 'half' data to 'float' first,
// do the operation and convert result back to 'half'
template <>
__global__ void _ShrinkKernel(const half* input_data, const float bias, const float lambda,
                              half* output_data, const HIP_LONG N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  half x = input_data[id];
  if ((float)x < -lambda) {
    output_data[id] = half((float)x + bias);
  } else if ((float)x > lambda) {
    output_data[id] = half((float)x - bias);
  } else {
    output_data[id] = (half)0;
  }
}

template <typename T>
void ShrinkImpl(hipStream_t stream, const T* input_data, const float bias, const float lambda,
                T* output_data, size_t N) {
  int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
  hipLaunchKernelGGL(HIP_KERNEL_NAME(_ShrinkKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     input_data, bias, lambda, output_data, (HIP_LONG)N);
}
#define SPECIALIZED_IMPL(T) \
template void ShrinkImpl<T>(hipStream_t stream, const T* input_data, const float bias, const float lambda, T* output_data, size_t N);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(uint8_t)
SPECIALIZED_IMPL(int8_t)
SPECIALIZED_IMPL(uint16_t)
SPECIALIZED_IMPL(int16_t)
SPECIALIZED_IMPL(uint32_t)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(uint64_t)
SPECIALIZED_IMPL(int64_t)

}  // namespace rocm
}  // namespace onnxruntime
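The kernels above apply the ONNX Shrink formula element-wise. A scalar reference version, useful for checking a single value by hand (plain C++, not part of the provider; the function name is hypothetical):

// y = x + bias if x < -lambd; y = x - bias if x > lambd; otherwise y = 0.
float shrink_reference(float x, float bias = 0.0f, float lambd = 0.5f) {
  if (x < -lambd) return x + bias;
  if (x > lambd) return x - bias;
  return 0.0f;
}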
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/shrink_impl.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace onnxruntime {
namespace rocm {
template <typename T>
void ShrinkImpl(hipStream_t stream, const T* input_data, const float bias, const float lambda,
                T* output_data, size_t count);
}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nvtx_profile.cc (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifdef ENABLE_NVTX_PROFILE
#include "nvtx_profile.h"
#include "core/common/common.h"
#include <nvToolsExt.h>
#include <nvToolsExtCuda.h>
namespace onnxruntime {
namespace profile {

void NvtxRangeCreator::BeginImpl() {
  // enable only for debug builds because this function is for profiling only.
  nvtxEventAttributes_t eventAttrib;
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = static_cast<uint32_t>(color_);
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  eventAttrib.message.ascii = message_.c_str();

  range_id_ = nvtxRangeStartEx(&eventAttrib);
}

void NvtxRangeCreator::EndImpl() {
  // enable only for debug builds because this function is for profiling only.
  nvtxRangeEnd(range_id_);
}

void NvtxNestedRangeCreator::BeginImpl() {
  // enable only for debug builds because this function is for profiling only.
  nvtxEventAttributes_t eventAttrib;
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = static_cast<uint32_t>(color_);
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  eventAttrib.message.ascii = message_.c_str();

  nvtxRangePushEx(&eventAttrib);
}

void NvtxNestedRangeCreator::EndImpl() {
  // enable only for debug builds because this function is for profiling only.
  nvtxRangePop();
}

void NvtxMarkerCreator::Mark() {
  // enable only for debug builds because this function is for profiling only.
  nvtxEventAttributes_t eventAttrib;
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = static_cast<uint32_t>(color_);
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  eventAttrib.message.ascii = message_.c_str();

  nvtxMarkEx(&eventAttrib);
}

}  // namespace profile
}  // namespace onnxruntime
#endif
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nvtx_profile.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// These enclosed classes are wrappers over
// generating Nvidia's visual profile APIs.
// They can be used to plot the time intervals of forward and backward passes.
// They can also be used to plot the time span of a specific operator.
// When writing this file, Nvidia only supports this tool on Linux.
#ifdef ENABLE_NVTX_PROFILE
#pragma once
#include <cinttypes>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>
#include "core/common/common.h"
namespace onnxruntime {
namespace profile {

// Color in ARGB space.
// A: first 8 bits.
// R: next 8 bits.
// G: next 8 bits.
// B: last 8 bits.
// All color channels have range [0, 255].
enum class Color : uint32_t {
  Black = 0x00000000,
  Red = 0x00ff0000,
  DarkGreen = 0x00009900,
  Green = 0x0000ff00,
  LightGreen = 0x00ccffcc,
  Blue = 0x000000ff,
  Amber = 0x00ffbf00,
  LightAmber = 0x00fff2cc,
  White = 0x00ffffff,
  Cyan = 0x0000ffff,
  Magenta = 0x00ff00ff,
  Yellow = 0x00ffff00,
};

class RangeCreatorBase {
 public:
  RangeCreatorBase(const std::string message, const Color color)
      : message_(message), color_(color), is_begin_called_(false), is_end_called_(false){};

  // Check if Begin and End are both called.
  // It's pointless if not all of them are called.
  ~RangeCreatorBase() {
    if (!is_begin_called_) {
      std::cerr << "Begin must be called once." << std::endl;
    }
    if (!is_end_called_) {
      std::cerr << "End must be called once." << std::endl;
    }
  }

  // Mark the beginning of a range.
  void Begin() {
    ORT_ENFORCE(!is_begin_called_, "Begin cannot be called more than once.");
    ORT_ENFORCE(!is_end_called_, "Begin cannot be called after calling End.");
    BeginImpl();
    is_begin_called_ = true;
  }

  // Mark the end of a range.
  void End() {
    ORT_ENFORCE(is_begin_called_, "End must be called after calling Begin.");
    ORT_ENFORCE(!is_end_called_, "End cannot be called more than once.");
    EndImpl();
    is_end_called_ = true;
  }

  bool IsBeginCalled() const {
    return is_begin_called_;
  }

  bool IsEndCalled() const {
    return is_end_called_;
  }

  virtual void BeginImpl() = 0;
  virtual void EndImpl() = 0;

 protected:
  // Text on this event.
  const std::string message_;
  // Color of event in ARGB space.
  const Color color_;

  bool is_begin_called_;
  bool is_end_called_;
};

class NvtxRangeCreator final : public RangeCreatorBase {
 public:
  NvtxRangeCreator(const std::string message, const Color color)
      : RangeCreatorBase(message, color){};

  void BeginImpl() override;
  void EndImpl() override;

 private:
  // It records the event ID created by BeginImpl.
  // EndImpl needs this value to end the right event.
  uint64_t range_id_;
};

class NvtxNestedRangeCreator final : public RangeCreatorBase {
 public:
  NvtxNestedRangeCreator(const std::string message, const Color color)
      : RangeCreatorBase(message, color){};

  void BeginImpl() override;
  void EndImpl() override;
};

class NvtxMarkerCreator final {
 public:
  NvtxMarkerCreator(const std::string message, const Color color)
      : message_(message), color_(color){};
  void Mark();

 private:
  // Text on this marker.
  const std::string message_;

  // See nvtxRangeCreator.color_.
  const Color color_;
};

}  // namespace profile
}  // namespace onnxruntime
#endif
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nvtx_profile_context.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <thread>
#include <string>
#include <unordered_map>
#include "core/platform/ort_mutex.h"
#ifdef ENABLE_NVTX_PROFILE
namespace onnxruntime {
namespace profile {

// Singleton class for managing global NVTX profiling information.
class Context {
 public:
  static Context& GetInstance() {
    static Context instance_;
    return instance_;
  }

  // Return tag for the specified thread.
  // If the thread's tag doesn't exist, this function returns an empty string.
  std::string GetThreadTagOrDefault(const std::thread::id& thread_id) {
    const std::lock_guard<OrtMutex> lock(mtx_);
    return thread_tag_[thread_id];
  }

  // Set tag for the specified thread.
  void SetThreadTag(const std::thread::id& thread_id, const std::string& tag) {
    const std::lock_guard<OrtMutex> lock(mtx_);
    thread_tag_[thread_id] = tag;
  }

 private:
  Context() = default;
  ~Context() = default;
  Context(const Context&) = delete;
  Context& operator=(const Context&) = delete;

  // map from thread's id to its human-readable tag.
  std::unordered_map<std::thread::id, std::string> thread_tag_;
  OrtMutex mtx_;
};

}  // namespace profile
}  // namespace onnxruntime
#endif
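A minimal usage sketch for the Context singleton above, assuming the build defines ENABLE_NVTX_PROFILE; the function name and tag string are arbitrary illustrations.

#include <thread>
#include <string>

// onnxruntime::profile::Context as declared above.
void tag_current_thread() {
  auto& ctx = onnxruntime::profile::Context::GetInstance();
  ctx.SetThreadTag(std::this_thread::get_id(), "forward-pass");
  // Later lookups return the tag, or an empty string for untagged threads.
  std::string tag = ctx.GetThreadTagOrDefault(std::this_thread::get_id());
}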
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/non_max_suppression.cc (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "non_max_suppression.h"
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
#include "non_max_suppression_impl.h"
#include "core/providers/rocm/tensor/concat_impl.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    NonMaxSuppression,
    kOnnxDomain,
    10, 10,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 2)
        .InputMemoryType(OrtMemTypeCPUInput, 3)
        .InputMemoryType(OrtMemTypeCPUInput, 4),
    NonMaxSuppression);

ONNX_OPERATOR_KERNEL_EX(
    NonMaxSuppression,
    kOnnxDomain,
    11,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 2)
        .InputMemoryType(OrtMemTypeCPUInput, 3)
        .InputMemoryType(OrtMemTypeCPUInput, 4),
    NonMaxSuppression);

Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const {
  PrepareContext pc;
  ORT_RETURN_IF_ERROR(PrepareCompute(ctx, pc));

  int64_t max_output_boxes_per_class = 0;
  float iou_threshold = .0f;
  float score_threshold = .0f;

  ORT_RETURN_IF_ERROR(GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold));

  if (0 == pc.num_boxes_ || 0 == max_output_boxes_per_class) {
    ctx->Output(0, {0, 3});
    return Status::OK();
  }

  // TODO: use hipcub::DeviceSegmentedRadixSort::SortPairsDescending instead of
  // hipcub::DeviceRadixSort::SortPairsDescending to deal with multiple batches/classes in parallel
  std::vector<std::tuple<IAllocatorUniquePtr<void>, int>> all_selected_indices;
  int total_num_saved_outputs = 0;

  // safe downcast max_output_boxes_per_class to int as hipcub::DeviceSelect::Flagged() does not support int64_t
  int int_max_output_boxes_per_class = max_output_boxes_per_class > std::numeric_limits<int>::max()
                                           ? std::numeric_limits<int>::max()
                                           : static_cast<int>(max_output_boxes_per_class);

  for (int64_t batch_index = 0; batch_index < pc.num_batches_; ++batch_index) {
    for (int64_t class_index = 0; class_index < pc.num_classes_; ++class_index) {
      IAllocatorUniquePtr<void> d_selected_indices{};
      IAllocatorUniquePtr<void> h_number_selected_ptr{AllocateBufferOnCPUPinned<void>(sizeof(int))};
      auto* h_number_selected = static_cast<int*>(h_number_selected_ptr.get());
      ORT_RETURN_IF_ERROR(NonMaxSuppressionImpl(
          Stream(),
          [this](size_t bytes) { return GetScratchBuffer<void>(bytes); },
          pc,
          GetCenterPointBox(),
          batch_index,
          class_index,
          int_max_output_boxes_per_class,
          iou_threshold,
          score_threshold,
          d_selected_indices,
          h_number_selected));

      int num_saved_outputs = *h_number_selected;
      if (num_saved_outputs > 0) {
        all_selected_indices.emplace_back(std::move(d_selected_indices), num_saved_outputs);
        total_num_saved_outputs += num_saved_outputs;
      }
    }
  }

  if (total_num_saved_outputs == 0) {
    ctx->Output(0, {0, 3});
  } else {
    // concatenate outputs
    constexpr int last_dim = 3;
    const int num_elements = last_dim * total_num_saved_outputs;
    Tensor* output = ctx->Output(0, {static_cast<int64_t>(total_num_saved_outputs), last_dim});
    ORT_ENFORCE(output != nullptr);
    int64_t* dst = output->MutableData<int64_t>();

    size_t count = all_selected_indices.size();

    RocmAsyncBuffer<const void*> input_ptr(this, count);
    RocmAsyncBuffer<int64_t> concat_sizes_gpu(this, count);
    RocmAsyncBuffer<int64_t> concat_sizes_range_gpu(this, count);
    RocmAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, total_num_saved_outputs);

    int index = 0;
    for (size_t i = 0; i < count; i++) {
      auto& it = all_selected_indices[i];
      auto src = std::get<0>(it).get();
      auto size = std::get<1>(it);

      input_ptr.CpuPtr()[i] = src;
      concat_sizes_gpu.CpuPtr()[i] = size;
      concat_sizes_range_gpu.CpuPtr()[i] = (i == 0) ? size : size + concat_sizes_range_gpu.CpuPtr()[i - 1];
      for (int j = 0; j < size; j++) {
        axis_dimension_input_output_mapping_gpu.CpuPtr()[index++] = i;
      }
    }

    ORT_RETURN_IF_ERROR(concat_sizes_gpu.CopyToGpu());
    ORT_RETURN_IF_ERROR(axis_dimension_input_output_mapping_gpu.CopyToGpu());
    ORT_RETURN_IF_ERROR(concat_sizes_range_gpu.CopyToGpu());
    ORT_RETURN_IF_ERROR(input_ptr.CopyToGpu());

    ORT_RETURN_IF_ERROR(ConcatImpl(Stream(),
                                   sizeof(int64_t),
                                   num_elements,
                                   last_dim,
                                   concat_sizes_gpu.GpuPtr(),
                                   concat_sizes_range_gpu.GpuPtr(),
                                   axis_dimension_input_output_mapping_gpu.GpuPtr(),
                                   dst,
                                   input_ptr.GpuPtr(),
                                   static_cast<size_t>(num_elements)));
  }

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/non_max_suppression.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/object_detection/non_max_suppression.h"
namespace onnxruntime {
namespace rocm {

struct NonMaxSuppression final : public RocmKernel, public NonMaxSuppressionBase {
  explicit NonMaxSuppression(const OpKernelInfo& info) : RocmKernel(info), NonMaxSuppressionBase(info) {
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(NonMaxSuppression);
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/non_max_suppression_impl.cu (new file, mode 100644)
#include "hip/hip_runtime.h"
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/* Modifications Copyright (c) Microsoft. */
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include "non_max_suppression_impl.h"
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/rocm_common.h"
#include <hipcub/hipcub.hpp>
//TODO:fix the warnings
#ifdef _MSC_VER
#pragma warning(disable : 4244)
#endif
namespace
onnxruntime
{
namespace
rocm
{
using
namespace
nms_helpers
;
namespace
{
struct
__align__
(
16
)
Box
{
float
x1
,
y1
,
x2
,
y2
;
};
// This is the width of the bitmask for masking boxes for each thread.
// This needs to be a multiple of 2(a POD width usually) so that division and
// modulo can be implemented as bit operations during host selection.
constexpr
int
kNmsBoxesPerThread
=
8
*
sizeof
(
int
);
// Helper to calculate modulo mask and shift bits.
// For kNmsBoxesPerThread=32 ModuloMask will be 31, i.e 0x1F thus
// i % 32 == i & 31. Similarly ShiftBits will be 5 so that
// i / 32 == i >> 5. Using these bit operations should reduce the stall on host
// thread.
__device__
constexpr
int
NumBits
(
int
n
)
{
return
(
n
==
0
)
?
0
:
NumBits
(
n
>>
1
)
+
1
;
}
constexpr
int
kNmsBlockDim
=
16
;
constexpr
int
kNmsBlockDimMax
=
128
;
// Check whether two boxes have an IoU greater than threshold.
template
<
typename
T
>
__device__
inline
bool
OverThreshold
(
const
Box
*
a
,
const
Box
*
b
,
const
float
a_area
,
const
T
iou_threshold
)
{
const
float
b_area
=
(
b
->
x2
-
b
->
x1
)
*
(
b
->
y2
-
b
->
y1
);
if
(
a_area
==
0.0
f
||
b_area
==
0.0
f
)
return
false
;
const
float
xx1
=
fmaxf
(
a
->
x1
,
b
->
x1
);
const
float
yy1
=
fmaxf
(
a
->
y1
,
b
->
y1
);
const
float
xx2
=
fminf
(
a
->
x2
,
b
->
x2
);
const
float
yy2
=
fminf
(
a
->
y2
,
b
->
y2
);
// fdimf computes the positive difference between xx2+1 and xx1.
const
float
w
=
fdimf
(
xx2
,
xx1
);
const
float
h
=
fdimf
(
yy2
,
yy1
);
const
float
intersection
=
w
*
h
;
// Testing for aa/bb > t
// eq with aa > bb*t (b is !=0)
// avoiding divisions.
const
float
aa
=
intersection
;
const
float
bb
=
a_area
+
b_area
-
intersection
;
const
float
bt
=
bb
*
iou_threshold
;
return
aa
>=
bt
;
}
template
<
typename
T
>
__device__
inline
bool
CheckBit
(
T
*
bit_mask
,
int
bit
)
{
constexpr
int
kShiftLen
=
NumBits
(
8
*
sizeof
(
T
))
-
1
;
constexpr
int
kRemainderMask
=
8
*
sizeof
(
T
)
-
1
;
int
bin
=
bit
>>
kShiftLen
;
return
(
bit_mask
[
bin
]
>>
(
bit
&
kRemainderMask
))
&
1
;
}
// Produce a global bitmask (result_mask) of selected boxes from bitmask
// generated by NMSKernel Abort early if max_boxes boxes are selected. Bitmask
// is num_boxes*bit_mask_len bits indicating whether to keep or remove a box.
__global__
void
NMSReduce
(
const
int
*
bitmask
,
const
int
bit_mask_len
,
const
int
num_boxes
,
const
int
max_boxes
,
char
*
result_mask
)
{
extern
__shared__
int
local
[];
// set global mask to accept all boxes
for
(
int
box
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
box
<
bit_mask_len
;
box
+=
blockDim
.
x
*
gridDim
.
x
)
{
local
[
box
]
=
0xFFFFFFFF
;
}
__syncthreads
();
int
accepted_boxes
=
0
;
for
(
int
box
=
0
;
box
<
num_boxes
-
1
;
++
box
)
{
// if current box is masked by an earlier box, skip it.
if
(
!
CheckBit
(
local
,
box
))
{
continue
;
}
accepted_boxes
+=
1
;
int
offset
=
box
*
bit_mask_len
;
// update global mask with current box's mask
for
(
int
b
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
b
<
bit_mask_len
;
b
+=
blockDim
.
x
*
gridDim
.
x
)
{
local
[
b
]
&=
~
bitmask
[
offset
+
b
];
}
__syncthreads
();
if
(
accepted_boxes
>
max_boxes
)
break
;
}
// copy global mask to result_max char array. char array is needed for
// hipcub::DeviceSelect later.
for
(
int
box
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
box
<
num_boxes
;
box
+=
blockDim
.
x
*
gridDim
.
x
)
{
result_mask
[
box
]
=
CheckBit
(
local
,
box
);
}
}
// For each box, compute a bitmask of boxes which has an overlap with given box
// above threshold.
//
// Starting from highes scoring box, mark any box which has IoU>threshold with
// given box. Each thread processes a kNmsBoxesPerThread boxes per stride, and
// each box has bitmask of overlaps of length bit_mask_len.
//
__launch_bounds__
(
kNmsBlockDim
*
kNmsBlockDim
,
4
)
__global__
void
NMSKernel
(
const
int64_t
center_point_box
,
const
Box
*
d_desc_sorted_boxes
,
const
int
num_boxes
,
const
float
iou_threshold
,
const
int
bit_mask_len
,
int
*
d_delete_mask
)
{
for
(
int
i_block_offset
=
blockIdx
.
x
*
blockDim
.
x
;
i_block_offset
<
num_boxes
;
i_block_offset
+=
blockDim
.
x
*
gridDim
.
x
)
{
const
int
i
=
i_block_offset
+
threadIdx
.
x
;
if
(
i
<
num_boxes
)
{
for
(
int
j_thread_offset
=
kNmsBoxesPerThread
*
(
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
);
j_thread_offset
<
num_boxes
;
j_thread_offset
+=
kNmsBoxesPerThread
*
blockDim
.
y
*
gridDim
.
y
)
{
// Note : We can do everything using multiplication,
// and use fp16 - we are comparing against a low precision
// threshold.
int
above_threshold
=
0
;
// Make sure that threads are within valid domain.
bool
valid
=
false
;
// Loop over the next kNmsBoxesPerThread boxes and set corresponding bit
// if it is overlapping with current box
for
(
int
ib
=
0
;
ib
<
kNmsBoxesPerThread
;
++
ib
)
{
// This thread will compare Box i and Box j.
const
int
j
=
j_thread_offset
+
ib
;
if
(
i
>=
j
||
i
>=
num_boxes
||
j
>=
num_boxes
)
continue
;
valid
=
true
;
if
(
SuppressByIOU
(
reinterpret_cast
<
const
float
*>
(
d_desc_sorted_boxes
),
i
,
j
,
center_point_box
,
iou_threshold
))
{
// we have score[j] <= score[i].
above_threshold
|=
(
1U
<<
ib
);
}
}
if
(
valid
)
{
d_delete_mask
[
i
*
bit_mask_len
+
j_thread_offset
/
kNmsBoxesPerThread
]
=
above_threshold
;
}
}
}
}
}
// Variadic template helpers for Index selecting multiple arrays at the same
// time
template
<
typename
Index
>
__device__
inline
void
SelectHelper
(
const
Index
/*i_selected */
,
const
Index
/* i_original */
)
{}
template
<
typename
Index
,
typename
T
,
typename
...
Args
>
__device__
inline
void
SelectHelper
(
const
Index
i_selected
,
const
Index
i_original
,
const
T
*
original
,
T
*
selected
,
Args
...
args
)
{
selected
[
i_selected
]
=
original
[
i_original
];
SelectHelper
(
i_selected
,
i_original
,
args
...);
}
// Helper template to select elements from original arrays using the index
// mapping and store into selected array. Each array sharing same mapping need
// to be passed as pairs of pointers to original and selected arrays. For
// selecting 2 arrays call would be
// IndexMultiSelect(num_elements, indices, original1 ,selected1, original2,
// selected2).
template
<
typename
Index
,
typename
T
,
typename
...
Args
>
__global__
void
IndexMultiSelect
(
const
int
num_elements
,
const
Index
*
indices
,
const
T
*
original
,
T
*
selected
,
Args
...
args
)
{
for
(
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
idx
<
num_elements
;
idx
+=
blockDim
.
x
*
gridDim
.
x
)
{
SelectHelper
(
idx
,
indices
[
idx
],
original
,
selected
,
args
...);
}
}
template
<
typename
T
>
__global__
void
SetZero
(
const
int
count
,
T
*
__restrict__
ptr
)
{
// Check that the grid is one dimensional and index doesn't overflow.
assert
(
blockDim
.
y
==
1
);
assert
(
blockDim
.
z
==
1
);
assert
(
blockDim
.
x
*
gridDim
.
x
/
blockDim
.
x
==
gridDim
.
x
);
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
count
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
ptr
[
i
]
=
T
(
0
);
}
}
template
<
typename
T
>
__global__
void
Iota
(
const
int
num_elements
,
const
T
offset
,
T
*
to_fill
)
{
for
(
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
idx
<
num_elements
;
idx
+=
blockDim
.
x
*
gridDim
.
x
)
{
to_fill
[
idx
]
=
static_cast
<
T
>
(
idx
)
+
offset
;
}
}
__global__
void
NormalizeOutput
(
const
int
num_elements
,
const
int
*
original
,
int64_t
*
to_normalize
,
int64_t
batch_index
,
int64_t
class_index
)
{
for
(
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
idx
<
num_elements
;
idx
+=
blockDim
.
x
*
gridDim
.
x
)
{
to_normalize
[
idx
*
3
]
=
batch_index
;
to_normalize
[
idx
*
3
+
1
]
=
class_index
;
to_normalize
[
idx
*
3
+
2
]
=
static_cast
<
int64_t
>
(
original
[
idx
]);
}
}
Status
NmsGpu
(
hipStream_t
stream
,
std
::
function
<
IAllocatorUniquePtr
<
void
>
(
size_t
)
>
allocator
,
const
int64_t
center_point_box
,
const
float
*
d_sorted_boxes_float_ptr
,
const
int
num_boxes
,
const
float
iou_threshold
,
int
*
d_selected_indices
,
int
*
h_nkeep
,
const
int
max_boxes
)
{
// Making sure we respect the __align(16)__
// we promised to the compiler.
auto
iptr
=
reinterpret_cast
<
std
::
uintptr_t
>
(
d_sorted_boxes_float_ptr
);
ORT_ENFORCE
((
iptr
&
15
)
==
0
);
const
int
bit_mask_len
=
(
num_boxes
+
kNmsBoxesPerThread
-
1
)
/
kNmsBoxesPerThread
;
int
max_nms_mask_size
=
num_boxes
*
bit_mask_len
;
IAllocatorUniquePtr
<
void
>
d_nms_mask_ptr
{
allocator
(
max_nms_mask_size
*
sizeof
(
int
))};
auto
*
d_nms_mask
=
static_cast
<
int
*>
(
d_nms_mask_ptr
.
get
());
int
blocksPerGrid
=
(
int
)(
ceil
(
static_cast
<
float
>
(
max_nms_mask_size
)
/
GridDim
::
maxThreadsPerBlock
));
hipLaunchKernelGGL
(
HIP_KERNEL_NAME
(
SetZero
<
int
>
),
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
max_nms_mask_size
,
d_nms_mask
);
int
*
d_delete_mask
=
d_nms_mask
;
int
*
h_selected_count
=
h_nkeep
;
const
Box
*
d_sorted_boxes
=
reinterpret_cast
<
const
Box
*>
(
d_sorted_boxes_float_ptr
);
dim3
block_dim
,
thread_block
;
int
num_blocks
=
(
num_boxes
+
kNmsBlockDim
-
1
)
/
kNmsBlockDim
;
num_blocks
=
std
::
max
(
std
::
min
(
num_blocks
,
kNmsBlockDimMax
),
1
);
block_dim
.
x
=
num_blocks
;
block_dim
.
y
=
num_blocks
;
block_dim
.
z
=
1
;
thread_block
.
x
=
kNmsBlockDim
;
thread_block
.
y
=
kNmsBlockDim
;
thread_block
.
z
=
1
;
hipLaunchKernelGGL
(
NMSKernel
,
block_dim
,
thread_block
,
0
,
stream
,
center_point_box
,
d_sorted_boxes
,
num_boxes
,
iou_threshold
,
bit_mask_len
,
d_delete_mask
);
IAllocatorUniquePtr
<
void
>
d_selected_boxes_ptr
{
allocator
(
num_boxes
*
sizeof
(
char
))};
auto
*
d_selected_boxes
=
static_cast
<
char
*>
(
d_selected_boxes_ptr
.
get
());
IAllocatorUniquePtr
<
void
>
d_indices_ptr
{
allocator
(
num_boxes
*
sizeof
(
int
))};
auto
*
d_indices
=
static_cast
<
int
*>
(
d_indices_ptr
.
get
());
blocksPerGrid
=
(
int
)(
ceil
(
static_cast
<
float
>
(
num_boxes
)
/
GridDim
::
maxThreadsPerBlock
));
hipLaunchKernelGGL
(
HIP_KERNEL_NAME
(
Iota
<
int
>
),
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
num_boxes
,
0
,
d_indices
);
NMSReduce
<<<
1
,
1024
,
bit_mask_len
*
sizeof
(
int
),
stream
>>>
(
d_delete_mask
,
bit_mask_len
,
num_boxes
,
max_boxes
,
d_selected_boxes
);
size_t
flagged_buffer_size
=
0
;
HIP_RETURN_IF_ERROR
(
hipcub
::
DeviceSelect
::
Flagged
(
static_cast
<
void
*>
(
nullptr
),
// temp_storage
flagged_buffer_size
,
static_cast
<
int
*>
(
nullptr
),
// input
static_cast
<
char
*>
(
nullptr
),
// selection flag
static_cast
<
int
*>
(
nullptr
),
// selected items
static_cast
<
int
*>
(
nullptr
),
// num_selected
num_boxes
,
stream
));
IAllocatorUniquePtr
<
void
>
d_cub_scratch_buffer_ptr
{
allocator
(
flagged_buffer_size
)};
auto
*
d_cub_scratch_buffer
=
static_cast
<
uint8_t
*>
(
d_cub_scratch_buffer_ptr
.
get
());
IAllocatorUniquePtr
<
void
>
d_num_selected_ptr
{
allocator
(
sizeof
(
int
))};
auto
*
d_num_selected
=
static_cast
<
int
*>
(
d_num_selected_ptr
.
get
());
HIP_RETURN_IF_ERROR
(
hipcub
::
DeviceSelect
::
Flagged
(
d_cub_scratch_buffer
,
// temp_storage
flagged_buffer_size
,
d_indices
,
// input
d_selected_boxes
,
// selection flag
d_selected_indices
,
// selected items
d_num_selected
,
num_boxes
,
stream
));
HIP_RETURN_IF_ERROR
(
hipMemcpyAsync
(
h_selected_count
,
d_num_selected
,
sizeof
(
int
),
hipMemcpyDeviceToHost
,
stream
));
// hipStreamSynchronize is needed since the value of h_selected_count will be used by host after this function.
HIP_RETURN_IF_ERROR
(
hipStreamSynchronize
(
stream
));
return
Status
::
OK
();
}
struct
DeviceGreaterThan
{
float
threshold_
;
__host__
__device__
__forceinline__
DeviceGreaterThan
(
float
threshold
)
:
threshold_
(
threshold
)
{}
__host__
__device__
__forceinline__
bool
operator
()(
const
float
&
val
)
const
{
return
(
val
>
threshold_
);
}
};
}
// namespace
Status NonMaxSuppressionImpl(
    hipStream_t stream,
    std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
    const PrepareContext& pc,
    const int64_t center_point_box,
    int64_t batch_index,
    int64_t class_index,
    int max_output_boxes_per_class,
    float iou_threshold,
    float score_threshold,
    IAllocatorUniquePtr<void>& selected_indices,
    int* h_number_selected) {
  // STEP 1. Prepare data
  int num_boxes = pc.num_boxes_;
  const float* boxes_data = pc.boxes_data_ + batch_index * num_boxes * 4;
  const float* scores_data = pc.scores_data_ + (batch_index * pc.num_classes_ + class_index) * num_boxes;

  // prepare temporary memory for sorting scores
  // calculate the temporary size that is used for sorting
  size_t cub_sort_temp_storage_bytes = 0;
  HIP_RETURN_IF_ERROR(hipcub::DeviceRadixSort::SortPairsDescending(
      nullptr, cub_sort_temp_storage_bytes,
      static_cast<float*>(nullptr),  // scores
      static_cast<float*>(nullptr),  // sorted scores
      static_cast<int*>(nullptr),    // input indices
      static_cast<int*>(nullptr),    // sorted indices
      num_boxes,                     // num items
      0, 8 * sizeof(float),          // sort all bits
      stream));

  // allocate temporary memory
  IAllocatorUniquePtr<void> d_cub_sort_buffer_ptr{allocator(cub_sort_temp_storage_bytes)};
  auto* d_cub_sort_buffer = static_cast<uint8_t*>(d_cub_sort_buffer_ptr.get());
  IAllocatorUniquePtr<void> d_indices_ptr{allocator(num_boxes * sizeof(int))};
  auto* d_indices = static_cast<int*>(d_indices_ptr.get());
  IAllocatorUniquePtr<void> d_sorted_indices_ptr{allocator(num_boxes * sizeof(int))};
  auto* d_sorted_indices = static_cast<int*>(d_sorted_indices_ptr.get());
  IAllocatorUniquePtr<void> d_selected_indices_ptr{allocator(num_boxes * sizeof(int))};
  auto* d_selected_indices = static_cast<int*>(d_selected_indices_ptr.get());
  IAllocatorUniquePtr<void> d_sorted_scores_ptr{allocator(num_boxes * sizeof(float))};
  auto* d_sorted_scores = static_cast<float*>(d_sorted_scores_ptr.get());
  IAllocatorUniquePtr<void> d_sorted_boxes_ptr{allocator(num_boxes * 4 * sizeof(float))};
  auto* d_sorted_boxes = static_cast<float*>(d_sorted_boxes_ptr.get());

  // create a sequence of indices
  int blocksPerGrid = (int)(ceil(static_cast<float>(num_boxes) / GridDim::maxThreadsPerBlock));
  hipLaunchKernelGGL(HIP_KERNEL_NAME(Iota<int>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     num_boxes, 0, d_indices);
  HIP_RETURN_IF_ERROR(hipGetLastError());

  // sort scores
  HIP_RETURN_IF_ERROR(hipcub::DeviceRadixSort::SortPairsDescending(
      d_cub_sort_buffer, cub_sort_temp_storage_bytes,
      scores_data, d_sorted_scores,
      d_indices, d_sorted_indices,
      num_boxes,
      0, 8 * sizeof(float),  // sort all bits
      stream));

  // pick sorted scores
  const Box* original_boxes = reinterpret_cast<const Box*>(boxes_data);
  Box* sorted_boxes = reinterpret_cast<Box*>(d_sorted_boxes);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(IndexMultiSelect<int, Box>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     num_boxes, d_sorted_indices, original_boxes, sorted_boxes);
  HIP_RETURN_IF_ERROR(hipGetLastError());

  // STEP 2. filter boxes by scores
  int limited_num_boxes = num_boxes;
  if (pc.score_threshold_ != nullptr) {
    thrust::device_ptr<float> sorted_scores_device_ptr(d_sorted_scores);
    limited_num_boxes = thrust::count_if(
        thrust::hip::par.on(stream),
        sorted_scores_device_ptr,
        sorted_scores_device_ptr + num_boxes,
        DeviceGreaterThan(score_threshold));
    HIP_RETURN_IF_ERROR(hipGetLastError());
    if (limited_num_boxes == 0) {
      *h_number_selected = 0;
      return Status::OK();
    }
  }

  // STEP 3. launch NMS kernels
  ORT_RETURN_IF_ERROR(NmsGpu(stream,
                             allocator,
                             center_point_box,
                             d_sorted_boxes,
                             limited_num_boxes,
                             iou_threshold,
                             d_selected_indices,
                             h_number_selected,
                             max_output_boxes_per_class));
  HIP_RETURN_IF_ERROR(hipGetLastError());

  // STEP 4. map back to sorted indices
  *h_number_selected = std::min(*h_number_selected, max_output_boxes_per_class);
  int num_to_keep = *h_number_selected;
  if (num_to_keep > 0) {
    IAllocatorUniquePtr<void> d_output_indices_ptr{allocator(num_to_keep * sizeof(int))};
    auto* d_output_indices = static_cast<int*>(d_output_indices_ptr.get());
    IAllocatorUniquePtr<void> d_normalized_output_indices_ptr{allocator(num_to_keep * 3 * sizeof(int64_t))};
    auto* d_normalized_output_indices = static_cast<int64_t*>(d_normalized_output_indices_ptr.get());

    blocksPerGrid = (int)(ceil(static_cast<float>(num_to_keep) / GridDim::maxThreadsPerBlock));
    hipLaunchKernelGGL(HIP_KERNEL_NAME(IndexMultiSelect<int, int>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                       num_to_keep, d_selected_indices, d_sorted_indices, d_output_indices);
    hipLaunchKernelGGL(NormalizeOutput, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                       num_to_keep, d_output_indices, d_normalized_output_indices, batch_index, class_index);
    HIP_RETURN_IF_ERROR(hipGetLastError());

    selected_indices = std::move(d_normalized_output_indices_ptr);
  }
  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
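For reference, the sort-by-score and score-threshold steps above (the hipcub radix sort plus thrust::count_if with DeviceGreaterThan) can be mirrored on the host with standard algorithms. The sketch below only illustrates that logic on plain float arrays; it is not part of the provider code and does not use the ORT allocator or HIP APIs.

// Host-side illustration of STEP 1/2 above: sort box indices by descending
// score, then count how many scores exceed the threshold. Sketch only.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<float> scores = {0.9f, 0.1f, 0.75f, 0.4f};
  float score_threshold = 0.3f;

  // create a sequence of indices (the Iota kernel above)
  std::vector<int> indices(scores.size());
  std::iota(indices.begin(), indices.end(), 0);

  // sort indices by descending score (DeviceRadixSort::SortPairsDescending above)
  std::sort(indices.begin(), indices.end(),
            [&](int a, int b) { return scores[a] > scores[b]; });

  // count scores above the threshold (thrust::count_if with DeviceGreaterThan above)
  int limited_num_boxes = static_cast<int>(std::count_if(
      scores.begin(), scores.end(),
      [&](float v) { return v > score_threshold; }));

  printf("best box: %d, boxes above threshold: %d\n", indices[0], limited_num_boxes);
  return 0;
}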
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/non_max_suppression_impl.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include <functional>
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
namespace onnxruntime {
namespace rocm {

Status NonMaxSuppressionImpl(
    hipStream_t stream,
    std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
    const PrepareContext& pc,
    const int64_t center_point_box,
    int64_t batch_index,
    int64_t class_index,
    int max_output_boxes_per_class,
    float iou_threshold,
    float score_threshold,
    IAllocatorUniquePtr<void>& selected_indices,
    int* h_number_selected);

}  // namespace rocm
}  // namespace onnxruntime
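The `allocator` parameter above is a callback that returns an owning scratch buffer sized in bytes. A minimal stand-alone sketch of that pattern follows, using `malloc`/`free` and a type-erased deleter as stand-ins; the `ScratchPtr` alias is an assumption for illustration and differs from the real IAllocatorUniquePtr type.

#include <cstdlib>
#include <functional>
#include <memory>

// Hypothetical stand-in for IAllocatorUniquePtr<void>: an owning void* with a
// type-erased deleter. The real ORT type differs; this only shows the pattern.
using ScratchPtr = std::unique_ptr<void, std::function<void(void*)>>;

int main() {
  // The callback the implementation receives: "give me N bytes, I own the result".
  std::function<ScratchPtr(size_t)> allocator = [](size_t bytes) {
    return ScratchPtr(std::malloc(bytes), [](void* p) { std::free(p); });
  };

  // e.g. the temporary index buffer allocated inside NonMaxSuppressionImpl
  size_t num_boxes = 128;
  ScratchPtr d_indices_ptr = allocator(num_boxes * sizeof(int));
  auto* d_indices = static_cast<int*>(d_indices_ptr.get());
  (void)d_indices;  // released automatically when the unique_ptr goes out of scope
  return 0;
}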
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/roialign.cc
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "roialign.h"
#include "roialign_impl.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
RoiAlign, \
kOnnxDomain, \
10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<int64_t>()), \
RoiAlign<T>);
template <typename T>
Status RoiAlign<T>::ComputeInternal(OpKernelContext* context) const {
  // X
  const auto* X_ptr = context->Input<Tensor>(0);
  // rois
  const auto* rois_ptr = context->Input<Tensor>(1);
  // batch indices
  const auto* batch_indices_ptr = context->Input<Tensor>(2);

  const auto& x_dims = X_ptr->Shape();
  const auto& rois_dims = rois_ptr->Shape();
  const auto& batch_indices_dims = batch_indices_ptr->Shape();

  auto num_rois = batch_indices_dims[0];
  auto num_roi_cols = rois_dims[1];

  auto status = CheckROIAlignValidInput(X_ptr, rois_ptr, batch_indices_ptr);
  if (status != Status::OK()) {
    return status;
  }

  Tensor& Y = *context->Output(0, {num_rois, x_dims[1], this->output_height_, this->output_width_});
  int64_t output_size = Y.Shape().Size();

  if (output_size > 0) {
    RoiAlignImpl(
        Stream(),
        output_size,  // num threads
        reinterpret_cast<const typename ToHipType<T>::MappedType*>(X_ptr->Data<T>()),
        ToHipType<T>::FromFloat(this->spatial_scale_),
        x_dims[1],  // num channels
        x_dims[2],  // height
        x_dims[3],  // width
        this->output_height_,
        this->output_width_,
        this->sampling_ratio_,
        reinterpret_cast<const typename ToHipType<T>::MappedType*>(rois_ptr->Data<T>()),
        num_roi_cols,
        reinterpret_cast<typename ToHipType<T>::MappedType*>(Y.MutableData<T>()),
        this->mode_ == RoiAlignMode::avg,
        this->half_pixel_,
        batch_indices_ptr->Data<int64_t>());
  }

  return Status::OK();
}
#define SPECIALIZED_COMPUTE(T) \
REGISTER_KERNEL_TYPED(T) \
template Status RoiAlign<T>::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double)
// SPECIALIZED_COMPUTE(MLFloat16)

}  // namespace rocm
}  // namespace onnxruntime
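The output tensor allocated in ComputeInternal has shape {num_rois, channels, output_height, output_width}, and output_size is simply the product of those dimensions. A quick numeric illustration (the concrete values below are made up):

#include <cstdint>
#include <cstdio>

int main() {
  // hypothetical inputs: X is NCHW [2, 256, 64, 64], 100 RoIs, 7x7 pooled output
  int64_t num_rois = 100, channels = 256;
  int64_t output_height = 7, output_width = 7;

  // Y shape = {num_rois, x_dims[1], output_height_, output_width_}
  int64_t output_size = num_rois * channels * output_height * output_width;
  printf("Y = [%lld, %lld, %lld, %lld], %lld elements\n",
         (long long)num_rois, (long long)channels,
         (long long)output_height, (long long)output_width,
         (long long)output_size);
  return 0;
}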
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/roialign.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/object_detection/roialign.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
struct RoiAlign final : RocmKernel, RoiAlignBase {
  RoiAlign(const OpKernelInfo& info) : RocmKernel(info), RoiAlignBase(info) {}

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(RoiAlign);
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/roialign_impl.cu
0 → 100644
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Modifications Copyright (c) Microsoft. */
#include "roialign_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {

template <typename T>
__device__ T bilinear_interpolate(
    const T* bottom_data,
    const int height,
    const int width,
    T y,
    T x,
    const bool is_mode_avg,
    const int index /* index for debug only*/) {
  // deal with cases where inverse elements are out of the feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    // empty
    return 0;
  }

  if (y <= 0) {
    y = 0;
  }
  if (x <= 0) {
    x = 0;
  }

  int y_low = (int)y;
  int x_low = (int)x;
  int y_high;
  int x_high;

  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly, hx = 1. - lx;
  // do bilinear interpolation
  T v1 = bottom_data[y_low * width + x_low];
  T v2 = bottom_data[y_low * width + x_high];
  T v3 = bottom_data[y_high * width + x_low];
  T v4 = bottom_data[y_high * width + x_high];
  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  T val = is_mode_avg
              ? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4)             // mode Avg
              : max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4);  // mode Max

  return val;
}

template <typename T>
__global__ void RoIAlignForward(
    const int64_t nthreads,
    const T* bottom_data,
    const T spatial_scale,
    const int64_t channels,
    const int64_t height,
    const int64_t width,
    const int64_t pooled_height,
    const int64_t pooled_width,
    const int64_t sampling_ratio,
    const T* bottom_rois,
    int64_t roi_cols,
    T* top_data,
    const bool is_mode_avg,
    const bool half_pixel,
    const int64_t* batch_indices_ptr) {
  for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    // RoI could have 4 or 5 columns
    const T* offset_bottom_rois = bottom_rois + n * roi_cols;
    const auto roi_batch_ind = batch_indices_ptr[n];

    // Do not use rounding; this implementation detail is critical
    T roi_offset = half_pixel ? T(0.5) : T(0);
    T roi_start_w = offset_bottom_rois[0] * spatial_scale - roi_offset;
    T roi_start_h = offset_bottom_rois[1] * spatial_scale - roi_offset;
    T roi_end_w = offset_bottom_rois[2] * spatial_scale - roi_offset;
    T roi_end_h = offset_bottom_rois[3] * spatial_scale - roi_offset;

    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;
    if (!half_pixel) {
      // backward compatibility
      // Force malformed ROIs to be 1x1
      roi_width = max(roi_width, (T)1.);
      roi_height = max(roi_height, (T)1.);
    }

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    const T* offset_bottom_data =
        bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) * height * width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : _Ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : _Ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    T output_val = 0.;
    bool max_flag = false;
    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1
    {
      const T y = roi_start_h + ph * bin_size_h +
                  static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = roi_start_w + pw * bin_size_w +
                    static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);

        T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, is_mode_avg, index);

        if (is_mode_avg) {
          output_val += val;
        } else {
          if (!max_flag) {
            output_val = val;
            max_flag = true;
          } else {
            output_val = max(output_val, val);
          }
        }
      }
    }
    if (is_mode_avg) {
      output_val /= count;
    }

    top_data[index] = output_val;
  }
}

template <typename T>
void RoiAlignImpl(
    hipStream_t stream,
    const int64_t nthreads,
    const T* bottom_data,
    const T spatial_scale,
    const int64_t channels,
    const int64_t height,
    const int64_t width,
    const int64_t pooled_height,
    const int64_t pooled_width,
    const int64_t sampling_ratio,
    const T* bottom_rois,
    int64_t roi_cols,
    T* top_data,
    const bool is_mode_avg,
    const bool half_pixel,
    const int64_t* batch_indices_ptr) {
  int blocksPerGrid = (int)(ceil(static_cast<float>(nthreads) / GridDim::maxThreadsPerBlock));
  hipLaunchKernelGGL(HIP_KERNEL_NAME(RoIAlignForward<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     nthreads,
                     bottom_data,
                     spatial_scale,
                     channels,
                     height,
                     width,
                     pooled_height,
                     pooled_width,
                     sampling_ratio,
                     bottom_rois,
                     roi_cols,
                     top_data,
                     is_mode_avg,
                     half_pixel,
                     batch_indices_ptr);
}
#define SPECIALIZED_IMPL(T) \
template void RoiAlignImpl<T>( \
hipStream_t stream, \
const int64_t nthreads, \
const T* bottom_data, \
const T spatial_scale, \
const int64_t channels, \
const int64_t height, \
const int64_t width, \
const int64_t pooled_height, \
const int64_t pooled_width, \
const int64_t sampling_ratio, \
const T* bottom_rois, \
int64_t roi_cols, \
T* top_data, \
const bool is_mode_avg, \
const bool half_pixel, \
const int64_t* batch_indices_ptr);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)

}  // namespace rocm
}  // namespace onnxruntime
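The bilinear_interpolate device helper above clamps the sample point to the feature map and blends the four neighbouring pixels by their fractional distances. A host-side reference of the same arithmetic (average mode only) can be useful for checking the math independently of HIP; the sketch below is such an illustration, not part of the provider code.

#include <cstdio>
#include <vector>

// Host reference of the device bilinear_interpolate above (average mode).
float bilinear_interpolate_ref(const std::vector<float>& data, int height, int width, float y, float x) {
  if (y < -1.0f || y > height || x < -1.0f || x > width) return 0.f;  // outside the feature map
  if (y <= 0) y = 0;
  if (x <= 0) x = 0;
  int y_low = (int)y, x_low = (int)x;
  int y_high, x_high;
  if (y_low >= height - 1) { y_high = y_low = height - 1; y = (float)y_low; } else { y_high = y_low + 1; }
  if (x_low >= width - 1) { x_high = x_low = width - 1; x = (float)x_low; } else { x_high = x_low + 1; }
  float ly = y - y_low, lx = x - x_low;
  float hy = 1.f - ly, hx = 1.f - lx;
  // weights are the areas of the opposite sub-rectangles
  return hy * hx * data[y_low * width + x_low] + hy * lx * data[y_low * width + x_high] +
         ly * hx * data[y_high * width + x_low] + ly * lx * data[y_high * width + x_high];
}

int main() {
  // 2x2 feature map; sampling at the centre gives the mean of all four values
  std::vector<float> fm = {0.f, 1.f, 2.f, 3.f};
  printf("%f\n", bilinear_interpolate_ref(fm, 2, 2, 0.5f, 0.5f));  // prints 1.5
  return 0;
}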
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/roialign_impl.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
void RoiAlignImpl(
    hipStream_t stream,
    const int64_t nthreads,
    const T* bottom_data,
    const T spatial_scale,
    const int64_t channels,
    const int64_t height,
    const int64_t width,
    const int64_t pooled_height,
    const int64_t pooled_width,
    const int64_t sampling_ratio,
    const T* bottom_rois,
    int64_t roi_cols,
    T* top_data,
    const bool is_mode_avg,
    const bool half_pixel,
    const int64_t* batch_indices_ptr);

}  // namespace rocm
}  // namespace onnxruntime
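The kernel launched through this interface divides each RoI into pooled_height x pooled_width bins and samples each bin on a roi_bin_grid_h x roi_bin_grid_w grid: either sampling_ratio points per side, or ceil(roi_extent / pooled_extent) when sampling_ratio <= 0. A small host sketch of that bookkeeping, with made-up RoI numbers:

#include <cmath>
#include <cstdio>

int main() {
  // hypothetical RoI of 14.0 x 21.0 feature-map units, pooled to 7x7, adaptive sampling
  float roi_height = 14.0f, roi_width = 21.0f;
  int pooled_height = 7, pooled_width = 7;
  int sampling_ratio = 0;  // <= 0 means adaptive

  float bin_size_h = roi_height / pooled_height;  // 2.0
  float bin_size_w = roi_width / pooled_width;    // 3.0

  int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : (int)std::ceil(roi_height / pooled_height);  // 2
  int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : (int)std::ceil(roi_width / pooled_width);    // 3

  printf("bin %.1fx%.1f, %d x %d samples per bin (count=%d)\n",
         bin_size_h, bin_size_w, roi_bin_grid_h, roi_bin_grid_w, roi_bin_grid_h * roi_bin_grid_w);
  return 0;
}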
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_functions.cc
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/reduction/reduction_functions.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <set>
#include <utility>
#include "core/common/optional.h"
#include "core/framework/tensor_shape.h"
namespace onnxruntime {
namespace rocm {

namespace {
// gets min and max of single contiguous range of axes if available
optional<std::pair<int64_t, int64_t>> GetMinAndMaxContiguousAxes(
    int64_t rank,
    const std::vector<int64_t>& dims,
    const std::vector<int64_t>& original_axes) {
  assert(rank == static_cast<int64_t>(dims.size()));

  // empty axes means reduce all dimensions
  if (original_axes.empty()) {
    return std::make_pair(int64_t{0}, rank - 1);
  }

  // normalize axis values and sort
  const std::vector<int64_t> axes = [&original_axes, rank]() {
    std::vector<int64_t> result(original_axes);
    std::for_each(result.begin(), result.end(),
                  [rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); });
    std::sort(result.begin(), result.end());
    return result;
  }();

  assert(!axes.empty());

  const auto is_dim_one = [](int64_t dim) { return dim == 1; };

  for (auto a = axes.begin(), b = axes.begin() + 1; b != axes.end(); ++a, ++b) {
    ORT_ENFORCE(*a != *b, "axes must not contain duplicate values");

    // if axis values are adjacent, the axes are contiguous
    if (*a + 1 == *b) {
      continue;
    }

    // if all dimension values between adjacent axes are 1,
    // treat the axes as contiguous
    if (std::all_of(dims.begin() + *a + 1, dims.begin() + *b, is_dim_one)) {
      continue;
    }

    // otherwise, not contiguous
    return nullopt;
  }

  // expand axes over surrounding dimensions with value of 1
  const int64_t min_axis = [&dims, &axes, &is_dim_one]() -> int64_t {
    const auto& min_given_axis = axes.front();
    // note that std::reverse_iterator(it) refers to the element at (it-1)
    // it -> reverse it: element offset of -1
    const auto before_min_given_axis_rit = std::make_reverse_iterator(dims.begin() + min_given_axis);
    const auto before_min_axis_rit = std::find_if_not(before_min_given_axis_rit, dims.rend(), is_dim_one);
    // reverse it -> it: element offset of +1
    return std::distance(dims.begin(), before_min_axis_rit.base());
  }();

  const int64_t max_axis = [&dims, &axes, &is_dim_one]() {
    const auto& max_given_axis = axes.back();
    const auto after_max_given_axis_it = dims.begin() + max_given_axis + 1;
    const auto after_max_axis_it = std::find_if_not(after_max_given_axis_it, dims.end(), is_dim_one);
    return std::distance(dims.begin(), after_max_axis_it - 1);
  }();

  return std::make_pair(min_axis, max_axis);
}
}  // namespace

ApplicableMatrixReduction get_applicable_matrix_reduction(
    const miopenReduceTensorOp_t miopen_reduce_op,
    gsl::span<const int64_t> dims,
    gsl::span<const int64_t> original_axes,
    int& m_out,
    int& n_out) {
  if (miopen_reduce_op != MIOPEN_REDUCE_TENSOR_ADD && miopen_reduce_op != MIOPEN_REDUCE_TENSOR_AVG) {
    return ApplicableMatrixReduction::None;
  }

  // Remove all dims with value 1. This can help to optimize case like:
  // dims=[2,3,1,4,1,5] and axes=[0,2,4], which is same as dims=[2,3,4,5] and axes=[0].
  std::vector<int64_t> new_dims;
  std::vector<int64_t> new_axes;
  const auto original_rank = gsl::narrow<int64_t>(dims.size());
  std::set<int64_t> original_axes_set;
  for (const auto axis : original_axes) {
    original_axes_set.insert(HandleNegativeAxis(axis, original_rank));
  }

  int64_t new_axis = 0;
  for (size_t i = 0; i < dims.size(); i++) {
    if (dims[i] != 1) {
      new_dims.emplace_back(dims[i]);
      if (original_axes_set.find(gsl::narrow<int64_t>(i)) != original_axes_set.end()) {
        new_axes.emplace_back(new_axis);
      }
      new_axis++;
    }
  }

  // Empty axes means reduce all dimensions, which has different meaning,
  // so add a new dim to the end if all original axes are on dims with value 1.
  if (!original_axes.empty() && new_axes.empty()) {
    new_dims.emplace_back(1);
    new_axes.emplace_back(new_axis);
  }

  // If all dims are value 1, make sure it's not empty by adding a new dim.
  if (!dims.empty() && new_dims.empty()) {
    new_dims.emplace_back(1);
  }

  const auto rank = gsl::narrow<int64_t>(new_dims.size());
  const auto min_and_max_axes = GetMinAndMaxContiguousAxes(rank, new_dims, new_axes);
  if (!min_and_max_axes.has_value()) {
    return ApplicableMatrixReduction::None;
  }
  const auto& min_axis = min_and_max_axes->first;
  const auto& max_axis = min_and_max_axes->second;

  // axes from beginning means row reduction, axes to end means column reduction
  // for axes from beginning to end, either works and we do row reduction
  const bool axes_from_beginning = min_axis == 0;
  const bool axes_to_end = max_axis == rank - 1;

  // handle axes anchored to beginning or end
  if (!axes_from_beginning && !axes_to_end) {
    return ApplicableMatrixReduction::None;
  }

  // the axis index right after the last flattened into matrix rows
  const int64_t m_end_axis = axes_from_beginning ? max_axis + 1 : min_axis;
  const auto shape = TensorShape::FromExistingBuffer(new_dims);
  const auto m = shape.SizeToDimension(m_end_axis);
  const auto n = shape.SizeFromDimension(m_end_axis);

  ORT_ENFORCE(m > 0 && n > 0, "shape must not have negative dimensions: ", shape);

  if (m > std::numeric_limits<int>::max() || n > std::numeric_limits<int>::max()) {
    return ApplicableMatrixReduction::None;
  }

  m_out = gsl::narrow_cast<int>(m);
  n_out = gsl::narrow_cast<int>(n);

  return axes_from_beginning ? ApplicableMatrixReduction::Rows : ApplicableMatrixReduction::Columns;
}

}  // namespace rocm
}  // namespace onnxruntime
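get_applicable_matrix_reduction collapses the reduced axes into one contiguous block: axes anchored at the front of the shape map to a row reduction of an m x n matrix, axes anchored at the back to a column reduction. The stand-alone sketch below re-implements only that final classification for already-normalized, contiguous axes; it is an illustration under those assumptions, not the ORT function itself.

#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

// Sketch: classify a reduction over the contiguous, normalized axis range
// [min_axis, max_axis] as a row or column reduction of an m x n matrix.
// Mirrors the tail of get_applicable_matrix_reduction; assumes the
// contiguity check has already passed.
bool classify(const std::vector<int64_t>& dims, int64_t min_axis, int64_t max_axis,
              int64_t& m, int64_t& n, bool& is_row_reduction) {
  const int64_t rank = static_cast<int64_t>(dims.size());
  const bool axes_from_beginning = (min_axis == 0);
  const bool axes_to_end = (max_axis == rank - 1);
  if (!axes_from_beginning && !axes_to_end) return false;  // not applicable

  const int64_t m_end_axis = axes_from_beginning ? max_axis + 1 : min_axis;
  m = std::accumulate(dims.begin(), dims.begin() + m_end_axis, int64_t{1}, std::multiplies<int64_t>());
  n = std::accumulate(dims.begin() + m_end_axis, dims.end(), int64_t{1}, std::multiplies<int64_t>());
  is_row_reduction = axes_from_beginning;
  return true;
}

int main() {
  std::vector<int64_t> dims = {2, 3, 4};
  int64_t m, n;
  bool rows;
  classify(dims, 0, 1, m, n, rows);  // reduce axes {0,1}
  printf("axes {0,1}: %s reduction, m=%lld n=%lld\n", rows ? "row" : "column", (long long)m, (long long)n);
  classify(dims, 1, 2, m, n, rows);  // reduce axes {1,2}
  printf("axes {1,2}: %s reduction, m=%lld n=%lld\n", rows ? "row" : "column", (long long)m, (long long)n);
  return 0;
}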