gaoqiong / composable_kernel_ROCM / Commits

Commit 86f8ac01, authored Aug 12, 2024 by Jakub Piasecki

    Merge remote-tracking branch 'origin/develop' into jakpias/pool1d_fwd

Parents: 3f6360d0, ab60b390
Changes: 38 files in total; this page shows 18 changed files with 386 additions and 1006 deletions (+386 -1006).
+0   -96    library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
+0   -13    library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+0   -112   library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
+0   -5     library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
+0   -48    library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+0   -48    library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+0   -48    library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+0   -4     library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+0   -47    library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+0   -47    library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+0   -47    library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+14  -0     script/process_perf_data.py
+17  -0     script/process_perf_data.sh
+17  -0     script/process_qa_data.sh
+0   -63    script/test_reduce_with_index.sh
+2   -2     test/reduce/CMakeLists.txt
+168 -213   test/reduce/reduce_no_index.cpp
+168 -213   test/reduce/reduce_with_index.cpp
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp (deleted: 100644 → 0; view file @ 3f6360d0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using BF16 = ck::bhalf_t;
using F16  = ck::half_t;
using F32  = float;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using Empty_Tuple = ck::Tuple<>;

using namespace ck::tensor_layout::convolution;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

static constexpr auto ConvFwd3x3 = ConvolutionForwardSpecialization::Filter3x3;

static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;

template <index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename DsLayout,
          typename ELayout,
          ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv_fwd_xdl_merged_groups_bf16_instances = std::tuple<
    // clang-format off
    // Columns: NumDimSpatial | A/B/Ds/E Layout | AData | BData | AccData | CShuffle DataType | Ds DataType | EData |
    //   A/B/CDE Elementwise Operation | ConvForward Specialization | GEMM Specialization | NumGemmK Prefetch Stage |
    //   BlockSize | MPerBlock | NPerBlock | KPerBlock | AK1 | BK1 | MPerXDL | NPerXDL | MXdlPerWave | NXdlPerWave |
    //   ABlockTransfer ThreadClusterLengths_K0_M_K1 | ThreadClusterArrangeOrder | SrcAccessOrder | SrcVectorDim | SrcScalarPerVector | DstScalarPerVector_K1 | ABlockLdsAddExtraM |
    //   BBlockTransfer ThreadClusterLengths_K0_N_K1 | ThreadClusterArrangeOrder | SrcAccessOrder | SrcVectorDim | SrcScalarPerVector | DstScalarPerVector_K1 | BBlockLdsAddExtraN |
    //   CShuffle MXdlPerWavePerShuffle | NXdlPerWavePerShuffle | CBlockTransferClusterLengths_MBlock_MWaveMPerXdl_NBlock_NWaveNPerXdl | CBlockTransferScalarPerVector_NWaveNPerXdl |
    //   ACompute Type | BCompute Type | BlockGemm Pipeline Scheduler | NumGroupsToMerge
    // Instances with NumGroupsPerBatch > 1
    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 8>,
    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 16>,
    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 32>
    // clang-format on
    >;

template <index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename DsLayout,
          typename ELayout,
          ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv_fwd_xdl_merged_groups_f16_instances = std::tuple<
    // clang-format off
    // Columns: as in the bf16 table above
    // Instances with NumGroupsPerBatch > 1
    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 8>,
    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 16>,
    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 32>
    // clang-format on
    >;

template <index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename DsLayout,
          typename ELayout,
          ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv_fwd_xdl_merged_groups_f32_instances = std::tuple<
    // clang-format off
    // Columns: as in the bf16 table above
    // Instances with NumGroupsPerBatch > 1
    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 8>,
    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 16>,
    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 16, 16, 4, 4, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 32>
    // clang-format on
    >;

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
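
To make the packed template arguments easier to read, the first bf16 instance is repeated below with each argument labeled against the column-header comment; this annotation is added here for readability and is not part of the deleted file. The other two instances in each tuple differ only in the trailing NumGroupsToMerge argument.

// First bf16 instance, re-listed with each template argument labeled
// (annotation only; not part of the deleted file):
//
// DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
//     NDimSpatial, ALayout, BLayout, DsLayout, ELayout, // problem rank and tensor layouts
//     BF16, BF16, F32, BF16, DsLayout, BF16,            // AData, BData, AccData, CShuffleData, DsData, EData
//     PassThrough, PassThrough, PassThrough,            // A, B, CDE elementwise operations
//     ConvSpec, GemmMNKPadding,                         // conv-forward and GEMM specializations
//     1,                                                // NumGemmK prefetch stages
//     64,                                               // block size
//     64, 16, 16,                                       // MPerBlock, NPerBlock, KPerBlock
//     4, 4,                                             // AK1, BK1
//     16, 16,                                           // MPerXDL, NPerXDL
//     4, 1,                                             // MXdlPerWave, NXdlPerWave
//     S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1,  // ABlockTransfer: clusters, orders, vector dim/widths, extra-M
//     S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1,  // BBlockTransfer: clusters, orders, vector dim/widths, extra-N
//     1, 1, S<1, 16, 1, 4>, 1,                          // CShuffle per-shuffle counts, C block-transfer cluster/vector
//     BF16, BF16, LoopScheduler::Default,               // ACompute, BCompute, block-GEMM scheduler
//     8>                                                // NumGroupsToMerge (16 and 32 in the other two instances)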

library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp (view file @ 86f8ac01)

@@ -18,7 +18,6 @@
 #ifdef CK_USE_XDL
 #include "grouped_convolution_forward_xdl.inc"
 #include "grouped_convolution_forward_xdl_large_tensor.inc"
-#include "grouped_convolution_forward_xdl_merged_groups.inc"
 #include "grouped_convolution_forward_comp_xdl.inc"
 #include "grouped_convolution_forward_mem_inter_xdl.inc"
 #include "grouped_convolution_forward_mem_intra_xdl.inc"
@@ -203,8 +202,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances(
                     op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
-                    op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances(
                     op_ptrs);
@@ -220,8 +217,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances(
                     op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances(
-                    op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances(
                     op_ptrs);
@@ -239,8 +234,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances(
                     op_ptrs);
-                add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances(
-                    op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances(op_ptrs);
                 add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances(
                     op_ptrs);
@@ -300,8 +293,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(op_ptrs);
                 add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
                     op_ptrs);
-                add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
-                    op_ptrs);
                 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(op_ptrs);
                 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
                     op_ptrs);
@@ -358,8 +349,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(op_ptrs);
                 add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances(
                     op_ptrs);
-                add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances(
-                    op_ptrs);
                 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances(op_ptrs);
                 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances(
                     op_ptrs);
@@ -377,8 +366,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(op_ptrs);
                 add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
                     op_ptrs);
-                add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
-                    op_ptrs);
                 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances(op_ptrs);
                 add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances(
                     op_ptrs);
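
For context, here is a minimal consumer sketch (my illustration, not part of the commit) of how client code typically reaches the instance lists this factory header registers. The template-argument shape follows the DeviceGroupedConvFwdMultipleABD declarations elsewhere in this diff; DeviceOperationInstanceFactory::GetInstances and GetTypeString are assumed to keep their usual CK client-API form. After this change the merged-groups instances are simply absent from the returned list, so a caller like this needs no source change.

#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"

#include <iostream>

int main()
{
    using namespace ck::tensor_layout::convolution; // NHWGC, GKYXC, NHWGK
    using F16         = ck::half_t;
    using Empty_Tuple = ck::Tuple<>;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    // fp16, 2-D grouped forward conv, NHWGC/GKYXC/NHWGK: same parameter shape
    // as the add_device_* declarations shown in this diff.
    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
        2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, Empty_Tuple, F16,
        PassThrough, PassThrough, PassThrough>;

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "registered instances: " << op_ptrs.size() << '\n';
    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << '\n'; // inspect what is still available
    return 0;
}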

library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc (deleted: 100644 → 0; view file @ 3f6360d0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// grouped conv2d forward, NHWGC/GKYXC/NHWGK
#ifdef CK_ENABLE_BF16
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
                                                                BF16,
                                                                BF16,
                                                                Empty_Tuple,
                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances);
#endif
#ifdef CK_ENABLE_FP16
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
                                                                F16,
                                                                F16,
                                                                Empty_Tuple,
                                                                F16,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances);
#endif
#ifdef CK_ENABLE_FP32
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
                                                                F32,
                                                                F32,
                                                                Empty_Tuple,
                                                                F32,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances);
#endif
#ifdef CK_ENABLE_BF16
// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                BF16,
                                                                BF16,
                                                                Empty_Tuple,
                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances);
#endif
#ifdef CK_ENABLE_FP16
void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                F16,
                                                                F16,
                                                                Empty_Tuple,
                                                                F16,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances);
#endif
#ifdef CK_ENABLE_FP32
void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                F32,
                                                                F32,
                                                                Empty_Tuple,
                                                                F32,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances);
#endif

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt (view file @ 86f8ac01)

@@ -14,11 +14,6 @@ add_instance_library(device_grouped_conv2d_fwd_instance
     xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
     xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.cpp
     xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.cpp
-    # merged groups
-    # NHWGC, GKYXC, NHWGK
-    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
-    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp
-    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp
     #mem
     # NHWGC, GKYXC, NHWGK
     xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp

library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp (deleted: 100644 → 0; view file @ 3f6360d0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
                                                                BF16,
                                                                BF16,
                                                                Empty_Tuple,
                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
                                                                 NHWGC,
                                                                 GKYXC,
                                                                 Empty_Tuple,
                                                                 NHWGK,
                                                                 ConvFwdDefault>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
                                                                 NHWGC,
                                                                 GKYXC,
                                                                 Empty_Tuple,
                                                                 NHWGK,
                                                                 ConvFwd3x3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.cpp (deleted: 100644 → 0; view file @ 3f6360d0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
                                                                F16,
                                                                F16,
                                                                Empty_Tuple,
                                                                F16,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
                                                                ConvFwdDefault>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
                                                                ConvFwd3x3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp (deleted: 100644 → 0; view file @ 3f6360d0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
                                                                F32,
                                                                F32,
                                                                Empty_Tuple,
                                                                F32,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
                                                                ConvFwdDefault>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
                                                                NHWGC,
                                                                GKYXC,
                                                                Empty_Tuple,
                                                                NHWGK,
                                                                ConvFwd3x3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt (view file @ 86f8ac01)

@@ -13,10 +13,6 @@ set(GROUPED_CONV3D_FWD
     xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
     xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
-    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
-    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
-    xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
     xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp
     xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.cpp
     xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.cpp

library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp (deleted: 100644 → 0; view file @ 3f6360d0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                BF16,
                                                                BF16,
                                                                Empty_Tuple,
                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3,
                                                                 NDHWGC,
                                                                 GKZYXC,
                                                                 Empty_Tuple,
                                                                 NDHWGK,
                                                                 ConvFwdDefault>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3,
                                                                 NDHWGC,
                                                                 GKZYXC,
                                                                 Empty_Tuple,
                                                                 NDHWGK,
                                                                 ConvFwd3x3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp (deleted: 100644 → 0; view file @ 3f6360d0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                F16,
                                                                F16,
                                                                Empty_Tuple,
                                                                F16,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                ConvFwdDefault>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                ConvFwd3x3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp (deleted: 100644 → 0; view file @ 3f6360d0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                F32,
                                                                F32,
                                                                Empty_Tuple,
                                                                F32,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                ConvFwdDefault>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                ConvFwd3x3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

script/process_perf_data.py (view file @ 86f8ac01)

@@ -143,6 +143,12 @@ def parse_logfile(logfile):
         for line in open(logfile):
             if 'Best Perf' in line:
                 lst = line.split()
                 res.append(lst[36])
+    elif 'perf_fmha' in logfile:
+        for line in open(logfile):
+            if 'TFlops' in line:
+                lst = line.split()
+                line_dict = dict(zip(lst[1:], lst))
+                res.append(line_dict['TFlops,'])

     return res
@@ -304,6 +310,14 @@ def main():
         for i in range(1, len(results) + 1):
             testlist.append("Test%i" % i)
         table_name = "ck_mixed_gemm_tflops"
+    if 'fmha_fwd' in filename:
+        for i in range(1, len(results) + 1):
+            testlist.append("Test%i" % i)
+        table_name = "ck_fmha_fwd_tflops"
+    if 'fmha_bwd' in filename:
+        for i in range(1, len(results) + 1):
+            testlist.append("Test%i" % i)
+        table_name = "ck_fmha_bwd_tflops"
     tflops_base = get_baseline(table_name, conn)
     store_new_test_result(table_name, results, testlist, branch_name, node_id,
                           gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)
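
One non-obvious line in the new Python branch is dict(zip(lst[1:], lst)): zipping the token list against itself shifted by one maps every token to the token just before it, so line_dict['TFlops,'] retrieves the number printed immediately before the literal token "TFlops,". A standalone C++ rendering of the same idiom (my illustration; the sample log line is invented):

#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

int main()
{
    // Hypothetical fmha log line; only the "<number> TFlops," pattern matters.
    const std::string line = "fmha_fwd: 95.31 TFlops, 312.4 GB/s";

    // Split on whitespace, like Python's line.split().
    std::vector<std::string> lst;
    std::istringstream iss(line);
    for(std::string tok; iss >> tok;)
        lst.push_back(tok);

    // Equivalent of dict(zip(lst[1:], lst)): key = token, value = previous token.
    std::map<std::string, std::string> line_dict;
    for(std::size_t i = 1; i < lst.size(); ++i)
        line_dict[lst[i]] = lst[i - 1];

    std::cout << line_dict["TFlops,"] << '\n'; // prints "95.31"
    return 0;
}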

script/process_perf_data.sh (view file @ 86f8ac01)

@@ -13,3 +13,20 @@
 python3 process_perf_data.py perf_gemm.log
 python3 process_perf_data.py perf_resnet50_N256.log
 python3 process_perf_data.py perf_resnet50_N4.log
+
+file=./perf_fmha_fwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx942.log
+fi
+file=./perf_fmha_bwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx942.log
+fi
+file=./perf_fmha_fwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
+fi
+file=./perf_fmha_bwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
+fi

script/process_qa_data.sh (view file @ 86f8ac01)

@@ -21,3 +21,20 @@ python3 process_perf_data.py perf_gemm_bilinear.log
 python3 process_perf_data.py perf_reduction.log
 python3 process_perf_data.py perf_splitK_gemm.log
 python3 process_perf_data.py perf_onnx_gemm.log
+
+file=./perf_fmha_fwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx942.log
+fi
+file=./perf_fmha_bwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx942.log
+fi
+file=./perf_fmha_fwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
+fi
+file=./perf_fmha_bwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
+fi

script/test_reduce_with_index.sh (deleted: 100755 → 0; view file @ 3f6360d0)

#!/bin/bash

## The following will be used for CI
set -x

## for float
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2

## for float64
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 6 2

## for float16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 1 2

## for int8_t
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 3 2

## for bfloat16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 5 2

set +x

test/reduce/CMakeLists.txt (view file @ 86f8ac01)

-add_test_executable(test_reduce_no_index reduce_no_index.cpp)
-add_test_executable(test_reduce_with_index reduce_with_index.cpp)
+add_gtest_executable(test_reduce_no_index reduce_no_index.cpp)
+add_gtest_executable(test_reduce_with_index reduce_with_index.cpp)
 target_link_libraries(test_reduce_no_index PRIVATE utility device_reduce_instance)
 target_link_libraries(test_reduce_with_index PRIVATE utility device_reduce_instance)

test/reduce/reduce_no_index.cpp (view file @ 86f8ac01; the file was rewritten from a standalone getopt-driven test into a GoogleTest typed-test suite)

Old version (removed):

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <getopt.h>

#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_reduce_impl.hpp"

using namespace ck;

static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"reduceDimensions", required_argument, nullptr, 'R'},
                                       {"scales", required_argument, nullptr, 'S'},
                                       {"help", no_argument, nullptr, '?'},
                                       {nullptr, 0, nullptr, 0}};

class SimpleAppArgs
{
    private:
    int option_index = 0;

    public:
    std::vector<size_t> inLengths;
    std::vector<int> reduceDims;
    std::vector<float> scales;

    int data_type;
    int init_method = 1;

    void show_usage(const char* cmd)
    {
        std::cout << "Usage of " << cmd << std::endl;
        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths "
                     "(only 4-d tensor supported)"
                  << std::endl;
        std::cout << "--reduceDimensions or -R comma seperated list of dimension indexes to reduce "
                     "(only 1 or 3 or 4 dimensions supported)"
                  << std::endl;
        std::cout << "--scales or -S, comma separated two float values for alpha and beta"
                  << std::endl;
        std::cout << "Arg1 -- data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
        std::cout << "Arg2 -- init method(0=no init, 1=single integer value, 2=scope integer "
                     "value, 3=decimal value)"
                  << std::endl;
    };

    int processArgs(int argc, char* argv[])
    {
        using ck::host_common::getTypeValuesFromString;

        int ch;

        while(1)
        {
            ch = getopt_long(argc, argv, "D:R:S:", long_options, &option_index);
            if(ch == -1)
                break;
            switch(ch)
            {
            case 'D':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");
                inLengths = getTypeValuesFromString<size_t>(optarg);
                break;
            case 'R':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");
                reduceDims = getTypeValuesFromString<int>(optarg);
                break;
            case 'S':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");
                scales = getTypeValuesFromString<float>(optarg);
                break;
            case '?':
                if(std::string(long_options[option_index].name) == "help")
                {
                    show_usage(argv[0]);
                    return (-1);
                };
                break;
            default: show_usage(argv[0]); return (-1);
            };
        };

        if(optind + 2 > argc)
            throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");

        data_type   = std::atoi(argv[optind++]);
        init_method = std::atoi(argv[optind]);

        if(scales.empty())
        {
            scales.push_back(1.0f);
            scales.push_back(0.0f);
        };

        if(inLengths.size() != 4 ||
           (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
            return (-1);

        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
            return (-1);

        return (0);
    };
};

bool test_reduce_no_index(int data_type,
                          int init_method,
                          std::vector<int> reduceDims,
                          std::vector<size_t> inLengths,
                          ReduceTensorOp reduceOpId,
                          bool propagateNan,
                          float alpha,
                          float beta)
{
    using ck::profiler::profile_reduce_impl;

    bool result = true;

    if(data_type == 0)
    {
        result = profile_reduce_impl<float, float, float>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            false, alpha, beta);
    }
    else if(data_type == 1)
    {
        result = profile_reduce_impl<ck::half_t, float, ck::half_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            false, alpha, beta);
    }
    else if(data_type == 3)
    {
        result = profile_reduce_impl<int8_t, int32_t, int8_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            false, alpha, beta);
    }
    else if(data_type == 5)
    {
        result = profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            false, alpha, beta);
    }
    else if(data_type == 6)
    {
        result = profile_reduce_impl<double, double, double>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            false, alpha, beta);
    }

    return (result);
};

constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AVG;
constexpr bool propagateNan         = false;

int main(int argc, char* argv[])
{
    SimpleAppArgs args;

    bool result = true;

    if(argc == 1)
    {
        int data_type   = 1;
        int init_method = 2;
        std::vector<size_t> inLengths{64, 4, 280, 80};
        std::vector<std::vector<int>> v_reduceDims{
            {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};

        for(auto& reduceDims : v_reduceDims)
            result = result && test_reduce_no_index(data_type, init_method, reduceDims, inLengths,
                                                    reduceOpId, propagateNan, 1.0f, 0.0f);
    }
    else
    {
        if(args.processArgs(argc, argv) < 0)
        {
            throw std::runtime_error(
                "Invalid input arguments, test_reduce_no_index could not be executed!");
        };

        result = test_reduce_no_index(args.data_type,
                                      args.init_method,
                                      args.reduceDims,
                                      args.inLengths,
                                      reduceOpId,
                                      propagateNan,
                                      args.scales[0],
                                      args.scales[1]);
    }

    std::cout << "test_reduce_no_index ..... " << (result ? "SUCCESS" : "FAILURE") << std::endl;

    return (result ? 0 : -1);
};

New version (added):

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_reduce_impl.hpp"

#include <gtest/gtest.h>

using namespace ck;

struct ReduceParam
{
    bool do_verification{true};
    bool propagateNan{false};
    bool useIndex{false};
    bool time_kernel{false};
    bool do_dumpout{false};
    int init_method{2};
    float alpha{1.0f};
    float beta{0.0f};
    std::vector<size_t> inLengths{64, 4, 280, 82};
    std::vector<int> reduceDims{0, 1, 2, 3};
};

std::vector<std::vector<int>> SetGenericReduceDim()
{
    return {{0, 1, 2, 3}, {0, 1, 2}, {0, 1, 3}, {0, 2, 3}, {1, 2, 3}, {0}, {1}, {2}, {3}};
}

template <typename T>
class ReduceWithIndexTest : public ::testing::Test
{
    protected:
    using InDataType  = std::tuple_element_t<0, T>;
    using AccDataType = std::tuple_element_t<1, T>;
    using OutDataType = std::tuple_element_t<2, T>;

    static std::vector<ReduceParam> params;

    public:
    static void SetUpTestSuite()
    {
        // set testcase variables
        ReduceParam set;

        const auto setReduceDim = SetGenericReduceDim();
        for(std::size_t i(0); i < setReduceDim.size(); ++i)
        {
            set.reduceDims = setReduceDim[i];
            params.emplace_back(set);
        }
    }

    template <ReduceTensorOp ReduceOpIdType>
    void Run()
    {
        for(auto param : this->params)
        {
            bool success = ck::profiler::profile_reduce_impl<InDataType, AccDataType, OutDataType>(
                param.do_verification, param.init_method, param.do_dumpout, param.time_kernel,
                param.inLengths, param.reduceDims, ReduceOpIdType, param.propagateNan,
                param.useIndex, param.alpha, param.beta);
            EXPECT_TRUE(success);
        }
    }
};

template <typename T>
std::vector<ReduceParam> ReduceWithIndexTest<T>::params = {};

using Reduce_float_types       = ::testing::Types<std::tuple<float, float, float>>;
using Reduce_double_types      = ::testing::Types<std::tuple<double, double, double>>;
using Reduce_int8t_types       = ::testing::Types<std::tuple<int8_t, int8_t, int8_t>>;
using Reduce_half_types        = ::testing::Types<std::tuple<ck::half_t, ck::half_t, ck::half_t>>;
using Reduce_bhalf_float_Types = ::testing::Types<std::tuple<ck::bhalf_t, float, ck::bhalf_t>>;

template <typename TType>
class ReduceWithNoIndexFloat : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexDouble : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexInt8 : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexHalf : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexBHalfFloat : public ReduceWithIndexTest<TType>
{
};

TYPED_TEST_SUITE(ReduceWithNoIndexFloat, Reduce_float_types);
TYPED_TEST_SUITE(ReduceWithNoIndexDouble, Reduce_double_types);
TYPED_TEST_SUITE(ReduceWithNoIndexInt8, Reduce_int8t_types);
TYPED_TEST_SUITE(ReduceWithNoIndexHalf, Reduce_half_types);
TYPED_TEST_SUITE(ReduceWithNoIndexBHalfFloat, Reduce_bhalf_float_Types);

TYPED_TEST(ReduceWithNoIndexFloat, ReduceWithNoIndexTestFloat_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexFloat, ReduceWithNoIndexTestFloat_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexFloat, ReduceWithNoIndexTestFloat_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithNoIndexDouble, ReduceWithNoIndexTestDouble_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexDouble, ReduceWithNoIndexTestDouble_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexDouble, ReduceWithNoIndexTestDouble_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithNoIndexInt8, ReduceWithNoIndexTestInt8_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexInt8, ReduceWithNoIndexTestInt8_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexInt8, ReduceWithNoIndexTestInt8_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithNoIndexHalf, ReduceWithNoIndexTestHalf_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexHalf, ReduceWithNoIndexTestHalf_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexHalf, ReduceWithNoIndexTestHalf_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithNoIndexBHalfFloat, ReduceWithNoIndexTesBtHalfFloat_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexBHalfFloat, ReduceWithNoIndexTestBHalfFloat_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexBHalfFloat, ReduceWithNoIndexTestBHalfFloat_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}
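
The rewritten tests lean on GoogleTest's typed-test machinery, which can be hard to follow at this size. Below is a minimal, self-contained sketch of the same pattern, using made-up names and data: a fixture templated on a type tuple, a static parameter list filled once in SetUpTestSuite(), and TYPED_TEST cases that all drive one parameterized Run().

#include <gtest/gtest.h>

#include <tuple>
#include <vector>

// Fixture templated on a std::tuple of types; only element 0 is used here.
template <typename T>
class TypedSumTest : public ::testing::Test
{
    public:
    using ValueType = std::tuple_element_t<0, T>;

    // One shared parameter list per instantiated suite, filled exactly once.
    static std::vector<ValueType> params;

    static void SetUpTestSuite() { params = {1, 2, 3}; }

    // Every TYPED_TEST below reuses this loop with a different operation.
    template <typename Op>
    void Run(Op op, ValueType init, ValueType expected)
    {
        ValueType acc = init;
        for(auto v : this->params)
            acc = op(acc, v);
        EXPECT_EQ(acc, expected);
    }
};

template <typename T>
std::vector<typename TypedSumTest<T>::ValueType> TypedSumTest<T>::params = {};

using MyTypes = ::testing::Types<std::tuple<int>, std::tuple<long>>;
TYPED_TEST_SUITE(TypedSumTest, MyTypes);

TYPED_TEST(TypedSumTest, Sum)
{
    using V = typename TestFixture::ValueType;
    this->Run([](V a, V b) { return a + b; }, V{0}, V{6});
}

TYPED_TEST(TypedSumTest, Max)
{
    using V = typename TestFixture::ValueType;
    this->Run([](V a, V b) { return a > b ? a : b; }, V{0}, V{3});
}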

test/reduce/reduce_with_index.cpp (view file @ 86f8ac01; rewritten the same way as reduce_no_index.cpp)

Old version (removed):

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <getopt.h>

#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_reduce_impl.hpp"

using namespace ck;

static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"reduceDimensions", required_argument, nullptr, 'R'},
                                       {"scales", required_argument, nullptr, 'S'},
                                       {"help", no_argument, nullptr, '?'},
                                       {nullptr, 0, nullptr, 0}};

class SimpleAppArgs
{
    private:
    int option_index = 0;

    public:
    std::vector<size_t> inLengths;
    std::vector<int> reduceDims;
    std::vector<float> scales;

    int data_type;
    int init_method = 1;

    void show_usage(const char* cmd)
    {
        std::cout << "Usage of " << cmd << std::endl;
        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths "
                     "(only 4-d tensor supported)"
                  << std::endl;
        std::cout << "--reduceDimensions or -R comma seperated list of dimension indexes to reduce "
                     "(only 1 or 3 or 4 dimensions supported)"
                  << std::endl;
        std::cout << "--scales or -S, comma separated two float values for alpha and beta"
                  << std::endl;
        std::cout << "Arg1 -- data type (1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
        std::cout << "Arg2 -- init method(0=no init, 1=single integer value, 2=scope integer "
                     "value, 3=decimal value)"
                  << std::endl;
    };

    int processArgs(int argc, char* argv[])
    {
        using ck::host_common::getTypeValuesFromString;

        int ch;

        while(1)
        {
            ch = getopt_long(argc, argv, "D:R:S:", long_options, &option_index);
            if(ch == -1)
                break;
            switch(ch)
            {
            case 'D':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");
                inLengths = getTypeValuesFromString<size_t>(optarg);
                break;
            case 'R':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");
                reduceDims = getTypeValuesFromString<int>(optarg);
                break;
            case 'S':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");
                scales = getTypeValuesFromString<float>(optarg);
                break;
            case '?':
                if(std::string(long_options[option_index].name) == "help")
                {
                    show_usage(argv[0]);
                    return (-1);
                };
                break;
            default: show_usage(argv[0]); return (-1);
            };
        };

        if(optind + 2 > argc)
            throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");

        data_type   = std::atoi(argv[optind++]);
        init_method = std::atoi(argv[optind]);

        if(scales.empty())
        {
            scales.push_back(1.0f);
            scales.push_back(0.0f);
        };

        if(inLengths.size() != 4 ||
           (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
            return (-1);

        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
            return (-1);

        return (0);
    };
};

bool test_reduce_with_index(int data_type,
                            int init_method,
                            std::vector<int> reduceDims,
                            std::vector<size_t> inLengths,
                            ReduceTensorOp reduceOpId,
                            bool propagateNan,
                            float alpha,
                            float beta)
{
    using ck::profiler::profile_reduce_impl;

    bool result = true;

    if(data_type == 0)
    {
        result = profile_reduce_impl<float, float, float>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            true, alpha, beta);
    }
    else if(data_type == 1)
    {
        result = profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            true, alpha, beta);
    }
    else if(data_type == 3)
    {
        result = profile_reduce_impl<int8_t, int8_t, int8_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            true, alpha, beta);
    }
    else if(data_type == 5)
    {
        result = profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            true, alpha, beta);
    }
    else if(data_type == 6)
    {
        result = profile_reduce_impl<double, double, double>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            true, alpha, beta);
    }

    return (result);
};

constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AMAX;
constexpr bool propagateNan         = false;

int main(int argc, char* argv[])
{
    SimpleAppArgs args;

    bool result = true;

    if(argc == 1)
    {
        int data_type   = 1;
        int init_method = 2;
        std::vector<size_t> inLengths{64, 4, 280, 80};
        std::vector<std::vector<int>> v_reduceDims{
            {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};

        for(auto& reduceDims : v_reduceDims)
            result = result && test_reduce_with_index(data_type, init_method, reduceDims,
                                                      inLengths, reduceOpId, propagateNan,
                                                      1.0f, 0.0f);
    }
    else
    {
        if(args.processArgs(argc, argv) < 0)
        {
            throw std::runtime_error(
                "Invalid input arguments, test_reduce_with_index could not be executed!");
        };

        result = test_reduce_with_index(args.data_type,
                                        args.init_method,
                                        args.reduceDims,
                                        args.inLengths,
                                        reduceOpId,
                                        propagateNan,
                                        args.scales[0],
                                        args.scales[1]);
    }

    std::cout << "test_reduce_with_index ..... " << (result ? "SUCCESS" : "FAILURE") << std::endl;

    return (result ? 0 : -1);
};

New version (added):

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_reduce_impl.hpp"

#include <gtest/gtest.h>

using namespace ck;

struct ReduceParam
{
    bool do_verification{true};
    bool propagateNan{false};
    bool useIndex{false};
    bool time_kernel{false};
    bool do_dumpout{false};
    int init_method{2};
    float alpha{1.0f};
    float beta{0.0f};
    std::vector<size_t> inLengths{64, 4, 280, 82};
    std::vector<int> reduceDims{0, 1, 2, 3};
};

std::vector<std::vector<int>> SetGenericReduceDim()
{
    return {{0, 1, 2, 3}, {0, 1, 2}, {0, 1, 3}, {0, 2, 3}, {1, 2, 3}, {0}, {1}, {2}, {3}};
}

template <typename T>
class ReduceWithIndexTest : public ::testing::Test
{
    protected:
    using InDataType  = std::tuple_element_t<0, T>;
    using AccDataType = std::tuple_element_t<1, T>;
    using OutDataType = std::tuple_element_t<2, T>;

    static std::vector<ReduceParam> params;

    public:
    static void SetUpTestSuite()
    {
        // set testcase variables
        ReduceParam set;

        const auto setReduceDim = SetGenericReduceDim();
        for(std::size_t i(0); i < setReduceDim.size(); ++i)
        {
            set.reduceDims = setReduceDim[i];
            params.emplace_back(set);
        }
    }

    template <ReduceTensorOp ReduceOpIdType>
    void Run()
    {
        for(auto param : this->params)
        {
            bool success = ck::profiler::profile_reduce_impl<InDataType, AccDataType, OutDataType>(
                param.do_verification, param.init_method, param.do_dumpout, param.time_kernel,
                param.inLengths, param.reduceDims, ReduceOpIdType, param.propagateNan,
                param.useIndex, param.alpha, param.beta);
            EXPECT_TRUE(success);
        }
    }
};

template <typename T>
std::vector<ReduceParam> ReduceWithIndexTest<T>::params = {};

using Reduce_float_types       = ::testing::Types<std::tuple<float, float, float>>;
using Reduce_double_types      = ::testing::Types<std::tuple<double, double, double>>;
using Reduce_int8t_types       = ::testing::Types<std::tuple<int8_t, int8_t, int8_t>>;
using Reduce_half_types        = ::testing::Types<std::tuple<ck::half_t, ck::half_t, ck::half_t>>;
using Reduce_bhalf_float_Types = ::testing::Types<std::tuple<ck::bhalf_t, float, ck::bhalf_t>>;

template <typename TType>
class ReduceWithIndexFloat : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithIndexDouble : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithIndexInt8 : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithIndexHalf : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithIndexBHalfFloat : public ReduceWithIndexTest<TType>
{
};

TYPED_TEST_SUITE(ReduceWithIndexFloat, Reduce_float_types);
TYPED_TEST_SUITE(ReduceWithIndexDouble, Reduce_double_types);
TYPED_TEST_SUITE(ReduceWithIndexInt8, Reduce_int8t_types);
TYPED_TEST_SUITE(ReduceWithIndexHalf, Reduce_half_types);
TYPED_TEST_SUITE(ReduceWithIndexBHalfFloat, Reduce_bhalf_float_Types);

TYPED_TEST(ReduceWithIndexFloat, ReduceWithIndexTestFloat_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexFloat, ReduceWithIndexTestFloat_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexFloat, ReduceWithIndexTestFloat_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexDouble, ReduceWithIndexTestDouble_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexDouble, ReduceWithIndexTestDouble_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexDouble, ReduceWithIndexTestDouble_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexInt8, ReduceWithIndexTestInt8_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexInt8, ReduceWithIndexTestInt8_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexInt8, ReduceWithIndexTestInt8_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexHalf, ReduceWithIndexTestHalf_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexHalf, ReduceWithIndexTestHalf_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexHalf, ReduceWithIndexTestHalf_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexBHalfFloat, ReduceWithIndexTesBtHalfFloat_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexBHalfFloat, ReduceWithIndexTestBHalfFloat_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexBHalfFloat, ReduceWithIndexTestBHalfFloat_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}