Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
7a3b49e5
Commit
7a3b49e5
authored
Jun 25, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into contraction
parents
e07b3d8e
d3051d75
Changes
592
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
403 additions
and
375 deletions
+403
-375
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+16
-13
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
+14
-11
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
+14
-11
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+14
-11
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
+15
-14
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
+15
-14
example/12_reduce/README.md
example/12_reduce/README.md
+6
-7
example/12_reduce/reduce_blockwise.cpp
example/12_reduce/reduce_blockwise.cpp
+47
-36
example/12_reduce/reduce_blockwise_two_call.cpp
example/12_reduce/reduce_blockwise_two_call.cpp
+61
-50
example/13_pool2d_fwd/pool2d_fwd_common.hpp
example/13_pool2d_fwd/pool2d_fwd_common.hpp
+36
-30
example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
+6
-5
example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
+6
-5
example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
...quant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
+15
-15
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+15
-15
example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
+23
-18
example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp
...e/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp
+31
-32
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
+15
-14
example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
...e/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+30
-29
example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
+12
-35
example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
+12
-10
No files found.
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "c
heck_err
.hpp"
#include "c
onfig
.hpp"
#include "c
onv_util
.hpp"
#include "
device
.hpp"
#include "device_tensor.hpp"
#include "
device_convnd_fwd_xdl_nhwc_kyxc_nhwk
.hpp"
#include "
element_wise_operation
.hpp"
#include "
host_tensor
.hpp"
#include "host_tensor
_generat
or.hpp"
#include "
reference_conv_fwd
.hpp"
#include "
tensor_layout
.hpp"
#include "c
k/ck
.hpp"
#include "c
k/tensor_operation/gpu/device/tensor_layout
.hpp"
#include "c
k/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk
.hpp"
#include "
ck/tensor_operation/gpu/element/element_wise_operation
.hpp"
#include "
ck/library/utility/check_err
.hpp"
#include "
ck/library/utility/conv_util
.hpp"
#include "
ck/library/host_tensor/device_memory
.hpp"
#include "
ck/library/
host_tensor
/host_tens
or.hpp"
#include "
ck/library/host_tensor/host_tensor_generator
.hpp"
#include "
ck/library/reference_tensor_operation/cpu/reference_conv_fwd
.hpp"
namespace
{
...
...
@@ -291,8 +294,8 @@ int main(int argc, char* argv[])
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
conv
->
GetTypeString
()
<<
std
::
endl
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
conv
->
GetTypeString
()
<<
std
::
endl
;
if
(
do_verification
)
{
...
...
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "c
heck_err
.hpp"
#include "c
onfig
.hpp"
#include "c
onv_util
.hpp"
#include "
device
.hpp"
#include "device_tensor.hpp"
#include "
device_convnd_fwd_xdl_nhwc_kyxc_nhwk
.hpp"
#include "
element_wise_operation
.hpp"
#include "
host_tensor
.hpp"
#include "host_tensor
_generat
or.hpp"
#include "
reference_conv_fwd
.hpp"
#include "
tensor_layout
.hpp"
#include "c
k/ck
.hpp"
#include "c
k/tensor_operation/gpu/device/tensor_layout
.hpp"
#include "c
k/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk
.hpp"
#include "
ck/tensor_operation/gpu/element/element_wise_operation
.hpp"
#include "
ck/library/utility/check_err
.hpp"
#include "
ck/library/utility/conv_util
.hpp"
#include "
ck/library/host_tensor/device_memory
.hpp"
#include "
ck/library/
host_tensor
/host_tens
or.hpp"
#include "
ck/library/host_tensor/host_tensor_generator
.hpp"
#include "
ck/library/reference_tensor_operation/cpu/reference_conv_fwd
.hpp"
namespace
{
...
...
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "c
heck_err
.hpp"
#include "c
onfig
.hpp"
#include "c
onv_util
.hpp"
#include "
device
.hpp"
#include "device_tensor.hpp"
#include "
device_convnd_fwd_xdl_nhwc_kyxc_nhwk
.hpp"
#include "
element_wise_operation
.hpp"
#include "
host_tensor
.hpp"
#include "host_tensor
_generat
or.hpp"
#include "
reference_conv_fwd
.hpp"
#include "
tensor_layout
.hpp"
#include "c
k/ck
.hpp"
#include "c
k/tensor_operation/gpu/device/tensor_layout
.hpp"
#include "c
k/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk
.hpp"
#include "
ck/tensor_operation/gpu/element/element_wise_operation
.hpp"
#include "
ck/library/utility/check_err
.hpp"
#include "
ck/library/utility/conv_util
.hpp"
#include "
ck/library/host_tensor/device_memory
.hpp"
#include "
ck/library/
host_tensor
/host_tens
or.hpp"
#include "
ck/library/host_tensor/host_tensor_generator
.hpp"
#include "
ck/library/reference_tensor_operation/cpu/reference_conv_fwd
.hpp"
namespace
{
...
...
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "c
heck_err
.hpp"
#include "c
onfig
.hpp"
#include "c
onv_util
.hpp"
#include "
device
.hpp"
#include "device_tensor.hpp"
#include "
device_convnd_fwd_xdl_nhwc_kyxc_nhwk
.hpp"
#include "
element_wise_operation
.hpp"
#include "
host_tensor
.hpp"
#include "host_tensor
_generat
or.hpp"
#include "
reference_conv_fwd
.hpp"
#include "
tensor_layout
.hpp"
#include "c
k/ck
.hpp"
#include "c
k/tensor_operation/gpu/device/tensor_layout
.hpp"
#include "c
k/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk
.hpp"
#include "
ck/tensor_operation/gpu/element/element_wise_operation
.hpp"
#include "
ck/library/utility/check_err
.hpp"
#include "
ck/library/utility/conv_util
.hpp"
#include "
ck/library/host_tensor/device_memory
.hpp"
#include "
ck/library/
host_tensor
/host_tens
or.hpp"
#include "
ck/library/host_tensor/host_tensor_generator
.hpp"
#include "
ck/library/reference_tensor_operation/cpu/reference_conv_fwd
.hpp"
namespace
{
...
...
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "element_wise_operation.hpp"
#include "device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp"
#include "reference_conv_bwd_data.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
using
InDataType
=
ck
::
half_t
;
using
WeiDataType
=
ck
::
half_t
;
...
...
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "element_wise_operation.hpp"
#include "device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "reference_conv_backward_weight.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
using
InDataType
=
ck
::
half_t
;
using
WeiDataType
=
ck
::
half_t
;
...
...
example/12_reduce/README.md
View file @
7a3b49e5
...
...
@@ -5,14 +5,14 @@
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg2: time kernel (0=no, 1=yes)
#arg2: time kernel (0=no, 1=yes)
./bin/example_reduce_blockwise
-D
16,64,32,960
-v
1 1 1
```
Result
```
./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 0.282592 ms, 222.641 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
...
...
@@ -24,19 +24,18 @@ Perf: 0.282592 ms, 222.641 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSr
```
bash
#arg1: verification (0=no, 1=yes(
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg3: time kernel (0=no, 1=yes)
#arg3: time kernel (0=no, 1=yes)
./bin/example_reduce_blockwise_two_call 1 2 1
```
Result
```
./bin/example_reduce_blockwise_two_call 1 2 1
launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1}
launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {6400, 1, 1}, block_dim {256, 1, 1}
launch_and_time_kernel: grid_dim {6400, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 2.1791 ms, 771.42 GB/s, DeviceReduceBlockWise<256,M_C32_S1,K_C8_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> => DeviceReduceBlockWise<256,M_C256_S1,K_C1_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1>
```
example/12_reduce/reduce_blockwise.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_reduce_multiblock.hpp"
#include "host_common_util.hpp"
#include "host_reduction.hpp"
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/host_tensor/host_common_util.hpp"
#include "ck/library/host_tensor/host_reduction.hpp"
using
namespace
ck
;
using
namespace
ck
::
tensor_operation
::
device
;
...
...
@@ -33,11 +33,11 @@ constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr
bool
PropagateNan
=
true
;
constexpr
bool
OutputIndex
=
false
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
AccDataType
,
ReduceOpId
>::
opType
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
ReduceOpId
>::
opType
;
using
InElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
using
DeviceReduceInstance
=
DeviceReduceMultiBlock
<
InDataType
,
AccDataType
,
...
...
@@ -147,8 +147,6 @@ class SimpleAppArgs
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
::
host_reduce
;
const
std
::
vector
<
int
>
reduceDims
{
0
,
1
,
2
};
const
std
::
vector
<
int
>
invariantDims
{
3
};
...
...
@@ -249,20 +247,34 @@ int main(int argc, char* argv[])
DeviceMem
out_index_dev
(
indicesSizeInBytes
);
InElementwiseOperation
in_elementwise_op
;
AccElementwiseOperation
acc_elementwise_op
;
std
::
tie
(
in_elementwise_op
,
acc_elementwise_op
)
=
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
GetElementwiseOperator
(
static_cast
<
int32_t
>
(
reduce_total_length
));
if
(
args
.
do_verification
)
{
ReductionHost
<
InDataType
,
AccDataType
,
OutDataType
,
ReduceOpId
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
Rank
,
NumReduceDim
,
PropagateNan
,
OutputIndex
>
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
in
.
mData
.
data
(),
beta
,
out_ref
.
mData
.
data
(),
out_indices_ref
.
mData
.
data
());
hostReduce
.
Run
(
alpha
,
in
.
mData
.
data
(),
beta
,
out_ref
.
mData
.
data
(),
out_indices_ref
.
mData
.
data
(),
in_elementwise_op
,
acc_elementwise_op
);
};
std
::
vector
<
ck
::
index_t
>
i_inLengths
;
...
...
@@ -277,20 +289,19 @@ int main(int argc, char* argv[])
auto
reduce
=
DeviceReduceInstance
{};
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
i_inLengths
,
i_inStrides
,
i_outLengths
,
i_outStrides
,
reduceDims
,
alpha
,
beta
,
in_dev
.
GetDeviceBuffer
(),
nullptr
,
out_dev
.
GetDeviceBuffer
(),
out_index_dev
.
GetDeviceBuffer
(),
InElementwiseOperation
{
static_cast
<
int32_t
>
(
reduce_total_length
)},
AccElementwiseOperation
{
static_cast
<
int32_t
>
(
reduce_total_length
)});
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
i_inLengths
,
i_inStrides
,
i_outLengths
,
i_outStrides
,
reduceDims
,
alpha
,
beta
,
in_dev
.
GetDeviceBuffer
(),
nullptr
,
out_dev
.
GetDeviceBuffer
(),
out_index_dev
.
GetDeviceBuffer
(),
in_elementwise_op
,
acc_elementwise_op
);
if
(
!
reduce
.
IsSupportedArgument
(
argument_ptr
.
get
()))
{
...
...
example/12_reduce/reduce_blockwise_two_call.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <sstream>
...
...
@@ -5,20 +8,17 @@
#include <cstdlib>
#include <getopt.h>
#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_reduce_multiblock.hpp"
#include "host_common_util.hpp"
#include "host_reduction.hpp"
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/host_tensor/host_common_util.hpp"
#include "ck/library/host_tensor/host_reduction.hpp"
using
namespace
ck
;
using
namespace
ck
::
tensor_operation
::
device
;
...
...
@@ -31,13 +31,13 @@ constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr
bool
PropagateNan
=
true
;
constexpr
bool
OutputIndex
=
false
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
AccDataType
,
ReduceOpId
>::
opType
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
ReduceOpId
>::
opType
;
using
InElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
using
PassThroughOp
=
tensor_operation
::
element_wise
::
UnaryIdentic
<
AccDataType
,
AccDataType
>
;
using
PassThroughOp
=
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceReduceInstance_1
=
DeviceReduceMultiBlock
<
InOutDataType
,
AccDataType
,
...
...
@@ -108,8 +108,6 @@ int main(int argc, char* argv[])
const
std
::
vector
<
size_t
>
outLengths
=
{
64
,
320
,
80
};
using
namespace
ck
::
host_reduce
;
if
(
argc
==
1
)
{
do_verify
=
true
;
...
...
@@ -186,19 +184,34 @@ int main(int argc, char* argv[])
if
(
beta
!=
0.0
f
)
out_dev
.
ToDevice
(
out
.
mData
.
data
());
InElementwiseOperation
in_elementwise_op
;
AccElementwiseOperation
acc_elementwise_op
;
std
::
tie
(
in_elementwise_op
,
acc_elementwise_op
)
=
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
GetElementwiseOperator
(
static_cast
<
int32_t
>
(
reduce_total_length
));
if
(
do_verify
)
{
ReductionHost
<
InOutDataType
,
AccDataType
,
InOutDataType
,
ReduceOpId
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
5
,
// Rank
2
,
// NumReduceDim
PropagateNan
,
OutputIndex
>
hostReduce
(
in_1
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
in_1
.
mData
.
data
(),
beta
,
out_ref
.
mData
.
data
(),
nullptr
);
hostReduce
.
Run
(
alpha
,
in_1
.
mData
.
data
(),
beta
,
out_ref
.
mData
.
data
(),
nullptr
,
in_elementwise_op
,
acc_elementwise_op
);
};
std
::
vector
<
ck
::
index_t
>
i_inLengths_1
;
...
...
@@ -217,20 +230,19 @@ int main(int argc, char* argv[])
auto
reduce_1
=
DeviceReduceInstance_1
{};
auto
argument_ptr_1
=
reduce_1
.
MakeArgumentPointer
(
i_inLengths_1
,
i_inStrides_1
,
i_inLengths_2
,
i_inStrides_2
,
reduceDims_1
,
1.0
f
,
0.0
f
,
in_1_dev
.
GetDeviceBuffer
(),
nullptr
,
in_2_dev
.
GetDeviceBuffer
(),
nullptr
,
InElementwiseOperation
{
static_cast
<
int32_t
>
(
reduce_total_length
)},
PassThroughOp
{});
auto
argument_ptr_1
=
reduce_1
.
MakeArgumentPointer
(
i_inLengths_1
,
i_inStrides_1
,
i_inLengths_2
,
i_inStrides_2
,
reduceDims_1
,
1.0
f
,
0.0
f
,
in_1_dev
.
GetDeviceBuffer
(),
nullptr
,
in_2_dev
.
GetDeviceBuffer
(),
nullptr
,
in_elementwise_op
,
PassThroughOp
{});
if
(
!
reduce_1
.
IsSupportedArgument
(
argument_ptr_1
.
get
()))
{
...
...
@@ -243,20 +255,19 @@ int main(int argc, char* argv[])
auto
reduce_2
=
DeviceReduceInstance_2
{};
auto
argument_ptr_2
=
reduce_2
.
MakeArgumentPointer
(
i_inLengths_2
,
i_inStrides_2
,
i_outLengths
,
i_outStrides
,
reduceDims_2
,
alpha
,
beta
,
in_2_dev
.
GetDeviceBuffer
(),
nullptr
,
out_dev
.
GetDeviceBuffer
(),
nullptr
,
PassThroughOp
{},
AccElementwiseOperation
{
static_cast
<
int32_t
>
(
reduce_total_length
)});
auto
argument_ptr_2
=
reduce_2
.
MakeArgumentPointer
(
i_inLengths_2
,
i_inStrides_2
,
i_outLengths
,
i_outStrides
,
reduceDims_2
,
alpha
,
beta
,
in_2_dev
.
GetDeviceBuffer
(),
nullptr
,
out_dev
.
GetDeviceBuffer
(),
nullptr
,
PassThroughOp
{},
acc_elementwise_op
);
if
(
!
reduce_2
.
IsSupportedArgument
(
argument_ptr_2
.
get
()))
{
...
...
example/13_pool2d_fwd/pool2d_fwd_common.hpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "c
heck_err
.hpp"
#include "c
onfig
.hpp"
#include "
print
.hpp"
#include "
device
.hpp"
#include "
host_
tensor.hpp"
#include "
host_
tensor_
gen
erat
or
.hpp"
#include "host_reduce_util.hpp"
#include "
device_tenso
r.hpp"
#include "
tensor_layout
.hpp"
#include "
reduction_enums
.hpp"
#include "
device_pool2d_fwd_nhwc_nhwc
.hpp"
#include "c
k/ck
.hpp"
#include "c
k/utility/reduction_enums
.hpp"
#include "
ck/utility/reduction_functions_accumulate
.hpp"
#include "
ck/tensor_operation/gpu/device/reduction_operator_mapping
.hpp"
#include "
ck/
tensor
_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc
.hpp"
#include "
ck/
tensor_
operation/gpu/element/element_wise_op
erat
ion
.hpp"
#include "
ck/library/utility/check_er
r.hpp"
#include "
ck/library/host_tensor/device_memory
.hpp"
#include "
ck/library/host_tensor/host_tensor
.hpp"
#include "
ck/library/host_tensor/host_tensor_generator
.hpp"
template
<
typename
InDataType
,
typename
OutDataType
,
...
...
@@ -29,19 +32,23 @@ static void pool_host_verify(const Tensor<InDataType>& in,
const
std
::
array
<
ck
::
index_t
,
2
>&
in_left_pads
,
const
std
::
array
<
ck
::
index_t
,
2
>&
/*in_right_pads*/
)
{
using
namespace
ck
::
host_reduce
;
const
int32_t
reduceLength
=
window_spatial_lengths
[
0
]
*
window_spatial_lengths
[
1
];
using
ReduceOperation
=
typename
ck
::
reduce_binary_operator
<
ReduceOpId
>::
opType
;
const
int32_t
divider
=
window_spatial_lengths
[
0
]
*
window_spatial_lengths
[
1
];
auto
elementwise_ops
=
ck
::
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
GetElementwiseOperator
(
reduceLength
);
const
auto
PreUnaryOp
=
PreUnaryOpFn
<
AccDataType
,
ReduceOpId
>
(
divider
);
const
auto
PosUnaryOp
=
PosUnaryOpFn
<
AccDataType
,
ReduceOpId
>
(
divider
);
auto
in_elementwise_op
=
std
::
get
<
0
>
(
elementwise_ops
);
auto
acc_elementwise_op
=
std
::
get
<
1
>
(
elementwise_ops
);
if
constexpr
(
!
OutputIndex
)
{
auto
opReduce
=
ReduceOpFn
<
AccDataType
,
ReduceOpId
>
();
using
Accumulation
=
ck
::
detail
::
AccumulateWithNanCheck
<
PropagateNan
,
ReduceOperation
,
AccDataType
>
;
auto
f_nchw
=
[
&
](
auto
n
,
auto
c
,
auto
ho
,
auto
wo
)
{
auto
accuVal
=
ReduceOp
Z
er
o
Val
<
AccDataType
,
ReduceOpId
>
();
auto
accuVal
=
ReduceOper
ation
::
template
GetIdentity
Val
ue
<
AccDataType
>();
for
(
ck
::
index_t
y
=
0
;
y
<
window_spatial_lengths
[
0
];
++
y
)
{
...
...
@@ -54,14 +61,14 @@ static void pool_host_verify(const Tensor<InDataType>& in,
{
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
PreUnaryOp
(
currVal
);
in_elementwise_op
(
currVal
,
currVal
);
binop_with_nan_check
<
AccDataType
,
PropagateNan
>
(
opReduce
,
accuVal
,
currVal
);
Accumulation
::
Calculate
(
accuVal
,
currVal
);
}
}
}
PosUnaryOp
(
accuVal
);
acc_elementwise_op
(
accuVal
,
accuVal
);
out
(
n
,
c
,
ho
,
wo
)
=
accuVal
;
};
...
...
@@ -74,10 +81,12 @@ static void pool_host_verify(const Tensor<InDataType>& in,
}
else
{
auto
opReduce
=
ReduceOpFn2
<
AccDataType
,
ReduceOpId
>
();
auto
f_nchw
=
[
&
](
auto
n
,
auto
c
,
auto
ho
,
auto
wo
)
{
auto
accuVal
=
ReduceOpZeroVal
<
AccDataType
,
ReduceOpId
>
();
using
Accumulation
=
ck
::
detail
::
AccumulateWithIndexAndNanCheck
<
PropagateNan
,
ReduceOperation
,
AccDataType
,
IndexDataType
>
;
auto
f_nchw
=
[
&
](
auto
n
,
auto
c
,
auto
ho
,
auto
wo
)
{
auto
accuVal
=
ReduceOperation
::
template
GetIdentityValue
<
AccDataType
>();
IndexDataType
accuIndex
=
0
;
for
(
ck
::
index_t
y
=
0
;
y
<
window_spatial_lengths
[
0
];
++
y
)
...
...
@@ -92,15 +101,14 @@ static void pool_host_verify(const Tensor<InDataType>& in,
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
IndexDataType
currIndex
=
y
*
window_spatial_lengths
[
1
]
+
x
;
PreUnaryOp
(
currVal
);
in_elementwise_op
(
currVal
,
currVal
);
binop_with_index_and_nan_check
<
AccDataType
,
IndexDataType
,
PropagateNan
>
(
opReduce
,
accuVal
,
currVal
,
accuIndex
,
currIndex
);
Accumulation
::
Calculate
(
accuVal
,
currVal
,
accuIndex
,
currIndex
);
}
}
}
PosUnaryOp
(
accuVal
);
acc_elementwise_op
(
accuVal
,
accuVal
);
out
(
n
,
c
,
ho
,
wo
)
=
accuVal
;
out_indices
(
n
,
c
,
ho
,
wo
)
=
accuIndex
;
...
...
@@ -139,8 +147,6 @@ bool pool_test(bool do_verification,
ck
::
index_t
in_right_pad_h
,
ck
::
index_t
in_right_pad_w
)
{
using
namespace
ck
::
host_reduce
;
using
DevicePoolFwdInstance
=
ck
::
tensor_operation
::
device
::
DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
<
InDataType
,
// InDataType
...
...
example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include "c
onfig
.hpp"
#include "tensor_layout.hpp"
#include "reduction_enums.hpp"
#include "c
k/ck
.hpp"
#include "
ck/tensor_operation/gpu/device/
tensor_layout.hpp"
#include "
ck/utility/
reduction_enums.hpp"
#include "pool2d_fwd_common.hpp"
...
...
@@ -27,8 +30,6 @@ static constexpr bool PropagateNan = false;
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
::
host_reduce
;
bool
do_verification
;
int
init_method
;
bool
time_kernel
;
...
...
example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include "c
onfig
.hpp"
#include "
tensor_layout
.hpp"
#include "
reduction_enums
.hpp"
#include "c
k/ck
.hpp"
#include "
ck/utility/reduction_enums
.hpp"
#include "
ck/tensor_operation/gpu/device/tensor_layout
.hpp"
#include "pool2d_fwd_common.hpp"
...
...
@@ -27,8 +30,6 @@ static constexpr bool PropagateNan = false;
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
::
host_reduce
;
bool
do_verification
;
int
init_method
;
bool
time_kernel
;
...
...
example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
struct
RequantReluRequant
{
...
...
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_grouped_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
...
example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_gemm_reduce_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
#include "element_wise_reduce_operation.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
...
@@ -41,9 +43,8 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DsReduceOp
=
ck
::
Tuple
<
ck
::
reduce
::
Max
<
ReduceAccDataType
>>
;
using
DsElementOp
=
ck
::
Tuple
<
ck
::
tensor_operation
::
element_wise
::
UnaryIdentic
<
ReduceAccDataType
,
ReduceAccDataType
,
false
>>
;
using
DsReduceOp
=
ck
::
Tuple
<
ck
::
reduce
::
Max
>
;
using
DsElementOp
=
ck
::
Tuple
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
using
DGlobalMemOp
=
ck
::
InMemoryDataOperationEnumSequence
<
ck
::
InMemoryDataOperationEnum
::
AtomicMax
>
;
...
...
@@ -236,10 +237,14 @@ int main(int argc, char* argv[])
for
(
int
m
=
0
;
m
<
M
;
++
m
)
{
ReduceAccDataType
d_acc
=
d_reduce_op
.
Get
ReductionZeroVal
();
ReduceAccDataType
d_acc
=
d_reduce_op
.
Get
IdentityValue
<
ReduceAccDataType
>
();
for
(
int
n
=
0
;
n
<
N
;
++
n
)
d_reduce_op
(
d_acc
,
c_m_n_host_result
(
m
,
n
));
{
ReduceAccDataType
curr_val
=
ck
::
type_convert
<
ReduceAccDataType
>
(
c_m_n_host_result
(
m
,
n
));
d_reduce_op
(
d_acc
,
curr_val
);
};
d_m_host_result
(
m
)
=
d_acc
;
}
...
...
example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_gemm_reduce_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "reduction_operator.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
#include "reduction_operator.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
...
@@ -41,18 +43,15 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
D0ReduceOp
=
ck
::
reduce
::
Add
<
ReduceAccDataType
>
;
using
D1ReduceOp
=
ck
::
reduce
::
Add
<
ReduceAccDataType
>
;
using
D0ReduceOp
=
ck
::
reduce
::
Add
;
using
D1ReduceOp
=
ck
::
reduce
::
Add
;
using
DxsReduceOp
=
ck
::
Tuple
<
D0ReduceOp
,
D1ReduceOp
>
;
using
UnaryIdenticElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnaryIdentic
<
ReduceAccDataType
,
ReduceAccDataType
,
false
>
;
using
UnaryDivElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnaryIdentic
<
ReduceAccDataType
,
ReduceAccDataType
,
true
>
;
using
UnarySquareElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnarySquare
<
ReduceAccDataType
,
ReduceAccDataType
,
false
>
;
using
DxsInElementOp
=
ck
::
Tuple
<
UnaryIdenticElementOp
,
UnarySquareElementOp
>
;
using
DxsOutElementOp
=
ck
::
Tuple
<
UnaryDivElementOp
,
UnaryDivElementOp
>
;
using
UnaryIdenticElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
UnaryDivElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnaryDivide
;
using
UnarySquareElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnarySquare
;
using
DxsInElementOps
=
ck
::
Tuple
<
UnaryIdenticElementOp
,
UnarySquareElementOp
>
;
using
DxsOutElementOps
=
ck
::
Tuple
<
UnaryDivElementOp
,
UnaryDivElementOp
>
;
using
DGlobalMemOp
=
ck
::
InMemoryDataOperationEnumSequence
<
ck
::
InMemoryDataOperationEnum
::
AtomicAdd
,
...
...
@@ -67,7 +66,7 @@ using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_
//######| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
F32
,
F32
,
F32
,
DPtrsGlobal
,
AElementOp
,
BElementOp
,
CElementOp
,
DxsReduceOp
,
DxsInElementOp
,
DxsOutElementOp
,
DGlobalMemOp
,
GemmSpecialization
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
S
<
64
,
4
>
,
4
,
1
>
;
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
F32
,
F32
,
F32
,
DPtrsGlobal
,
AElementOp
,
BElementOp
,
CElementOp
,
DxsReduceOp
,
DxsInElementOp
s
,
DxsOutElementOp
s
,
DGlobalMemOp
,
GemmSpecialization
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
S
<
64
,
4
>
,
4
,
1
>
;
// clang-format on
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
...
...
@@ -204,8 +203,8 @@ int main(int argc, char* argv[])
auto
dxs_global
=
ck
::
make_tuple
(
static_cast
<
DDataType
*>
(
d0_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DDataType
*>
(
d1_device_buf
.
GetDeviceBuffer
()));
auto
dxs_in_element_op
=
DxsInElementOp
{};
auto
dxs_out_element_op
=
DxsOutElementOp
{
M
,
M
};
auto
dxs_in_element_op
=
DxsInElementOp
s
{};
auto
dxs_out_element_op
=
DxsOutElementOp
s
{
N
,
N
};
// do GEMM
auto
gemm
=
DeviceGemmReduceInstance
{};
...
...
@@ -261,14 +260,14 @@ int main(int argc, char* argv[])
for
(
int
m
=
0
;
m
<
M
;
++
m
)
{
float
d0_acc
=
d0_reduce_op
.
Get
ReductionZeroVal
();
float
d1_acc
=
d1_reduce_op
.
Get
ReductionZeroVal
();
auto
d0_acc
=
d0_reduce_op
.
Get
IdentityValue
<
ReduceAccDataType
>
();
auto
d1_acc
=
d1_reduce_op
.
Get
IdentityValue
<
ReduceAccDataType
>
();
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
float
c_val
=
ck
::
type_convert
<
float
>
(
c_m_n_host_result
(
m
,
n
));
float
d0_val
=
0
;
float
d1_val
=
0
;
auto
c_val
=
ck
::
type_convert
<
ReduceAccDataType
>
(
c_m_n_host_result
(
m
,
n
));
ReduceAccDataType
d0_val
;
ReduceAccDataType
d1_val
;
dxs_in_element_op
(
ck
::
Number
<
0
>
{})(
d0_val
,
c_val
);
dxs_in_element_op
(
ck
::
Number
<
1
>
{})(
d1_val
,
c_val
);
...
...
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "conv_util.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "element_wise_operation.hpp"
#include "device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp"
#include "reference_conv_bwd_data.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
using
InDataType
=
ck
::
half_t
;
using
WeiDataType
=
ck
::
half_t
;
...
...
example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_batched_gemm_reduce_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "reduction_operator.hpp"
#include "reference_batched_gemm.hpp"
#include "gemm_specialization.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
...
@@ -39,16 +41,14 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
D0ReduceOp
=
ck
::
reduce
::
Add
<
ReduceAccDataType
>
;
using
D1ReduceOp
=
ck
::
reduce
::
Add
<
ReduceAccDataType
>
;
using
D0ReduceOp
=
ck
::
reduce
::
Add
;
using
D1ReduceOp
=
ck
::
reduce
::
Add
;
using
DxsReduceOp
=
ck
::
Tuple
<
D0ReduceOp
,
D1ReduceOp
>
;
using
UnaryIdenticElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnaryIdentic
<
ReduceAccDataType
,
ReduceAccDataType
,
false
>
;
using
UnarySquareElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnarySquare
<
ReduceAccDataType
,
ReduceAccDataType
,
false
>
;
using
DxsInElementOp
=
ck
::
Tuple
<
UnaryIdenticElementOp
,
UnarySquareElementOp
>
;
using
DxsOutElementOp
=
ck
::
Tuple
<
UnaryIdenticElementOp
,
UnaryIdenticElementOp
>
;
using
UnaryIdenticElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
UnarySquareElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnarySquare
;
using
DxsInElementOps
=
ck
::
Tuple
<
UnaryIdenticElementOp
,
UnarySquareElementOp
>
;
using
DxsOutElementOps
=
ck
::
Tuple
<
UnaryIdenticElementOp
,
UnaryIdenticElementOp
>
;
using
DGlobalMemOp
=
ck
::
InMemoryDataOperationEnumSequence
<
ck
::
InMemoryDataOperationEnum
::
AtomicAdd
,
...
...
@@ -63,7 +63,7 @@ using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatc
//######| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
F32
,
F32
,
F32
,
DPtrsGlobal
,
AElementOp
,
BElementOp
,
CElementOp
,
DxsReduceOp
,
DxsInElementOp
,
DxsOutElementOp
,
DGlobalMemOp
,
GemmSpecialization
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
S
<
64
,
4
>
,
4
,
1
>
;
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
F32
,
F32
,
F32
,
DPtrsGlobal
,
AElementOp
,
BElementOp
,
CElementOp
,
DxsReduceOp
,
DxsInElementOp
s
,
DxsOutElementOp
s
,
DGlobalMemOp
,
GemmSpecialization
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
S
<
64
,
4
>
,
4
,
1
>
;
// clang-format on
using
ReferenceBatchedGemmInstance
=
ck
::
tensor_operation
::
host
::
...
...
@@ -206,8 +206,8 @@ int main(int argc, char* argv[])
a_element_op
,
b_element_op
,
c_element_op
,
DxsInElementOp
{},
DxsOutElementOp
{},
DxsInElementOp
s
{},
DxsOutElementOp
s
{},
BatchCount
);
if
(
!
batched_gemm
.
IsSupportedArgument
(
argument
))
...
...
@@ -259,14 +259,15 @@ int main(int argc, char* argv[])
{
for
(
int
m
=
0
;
m
<
M
;
++
m
)
{
float
d0_acc
=
d0_reduce_op
.
Get
ReductionZeroVal
();
float
d1_acc
=
d1_reduce_op
.
Get
ReductionZeroVal
();
auto
d0_acc
=
d0_reduce_op
.
Get
IdentityValue
<
ReduceAccDataType
>
();
auto
d1_acc
=
d1_reduce_op
.
Get
IdentityValue
<
ReduceAccDataType
>
();
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
float
c_val
=
ck
::
type_convert
<
float
>
(
c_g_m_n_host_result
(
batch
,
m
,
n
));
float
d0_val
=
0
;
float
d1_val
=
0
;
auto
c_val
=
ck
::
type_convert
<
ReduceAccDataType
>
(
c_g_m_n_host_result
(
batch
,
m
,
n
));
ReduceAccDataType
d0_val
;
ReduceAccDataType
d1_val
;
UnaryIdenticElementOp
{}(
d0_val
,
c_val
);
UnarySquareElementOp
{}(
d1_val
,
c_val
);
...
...
example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
View file @
7a3b49e5
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "binary_element_wise_operation.hpp"
#include "device_binary_elementwise.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
...
...
@@ -42,8 +20,7 @@ using ABDataType = F16;
using
CDataType
=
F16
;
using
EltwiseComputeDataType
=
F32
;
using
Add
=
ck
::
tensor_operation
::
binary_element_wise
::
Add
<
EltwiseComputeDataType
,
EltwiseComputeDataType
,
EltwiseComputeDataType
>
;
using
Add
=
ck
::
tensor_operation
::
element_wise
::
Add
;
using
DeviceElementwiseAddInstance
=
ck
::
tensor_operation
::
device
::
DeviceBinaryElementwise
<
ABDataType
,
...
...
example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "binary_element_wise_operation.hpp"
#include "device_binary_elementwise.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
...
...
@@ -17,8 +20,7 @@ using ABDataType = F16;
using
CDataType
=
F16
;
using
EltwiseComputeDataType
=
F32
;
using
Add
=
ck
::
tensor_operation
::
binary_element_wise
::
Add
<
EltwiseComputeDataType
,
EltwiseComputeDataType
,
EltwiseComputeDataType
>
;
using
Add
=
ck
::
tensor_operation
::
element_wise
::
Add
;
using
DeviceElementwiseAddInstance
=
ck
::
tensor_operation
::
device
::
DeviceBinaryElementwise
<
ABDataType
,
...
...
Prev
1
2
3
4
5
6
…
30
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment