gaoqiong / composable_kernel / Commits / b79df771

Commit b79df771, authored Jul 12, 2022 by carlushuang
Merge remote-tracking branch 'origin/develop' into cpu_avx2
Parents: 05d38218, 63914743
Changes: 692

Showing 20 changed files with 579 additions and 548 deletions (+579 -548)
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp                                    +14 -11
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp                                    +14 -11
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp                                    +14 -11
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp                               +15 -14
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp                           +15 -14
example/12_reduce/README.md                                                       +6  -7
example/12_reduce/reduce_blockwise.cpp                                           +44 -33
example/12_reduce/reduce_blockwise_two_call.cpp                                  +58 -47
example/13_pool2d_fwd/pool2d_fwd_common.hpp                                      +22 -22
example/13_pool2d_fwd/pool2d_fwd_fp16.cpp                                         +6  -3
example/13_pool2d_fwd/pool2d_fwd_fp32.cpp                                         +6  -3
example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp  +15 -15
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp                                +15 -15
example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp                              +66 -56
example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp                  +97 -91
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp                           +15 -14
example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp                  +85 -74
example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp                        +22 -43
example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp                       +29 -21
example/19_binary_elementwise/elementwise_add_1d.cpp                             +21 -43
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <cstdlib>
 #include <iostream>
 #include <numeric>
 #include <type_traits>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "conv_util.hpp"
-#include "device.hpp"
-#include "device_tensor.hpp"
-#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
-#include "element_wise_operation.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "reference_conv_fwd.hpp"
-#include "tensor_layout.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 namespace
 {
 ...
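The same header reorganization repeats across the touched examples: flat, example-local header names are replaced by the library's installed `ck/...` paths, with device operations under `ck/tensor_operation/gpu/...`, host tensor helpers under `ck/library/host_tensor/...`, and checking/convolution utilities under `ck/library/utility/...`. A minimal sketch of what an example's include block looks like after the change (assuming the composable_kernel headers are on the include path; not a complete translation unit):

```cpp
// Hypothetical, minimal include block after the reorganization; the old flat
// names ("config.hpp", "host_tensor.hpp", ...) are no longer used.
#include "ck/ck.hpp"                                               // core library header
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"        // device-side operation headers
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"                        // host-side utility headers
#include "ck/library/host_tensor/host_tensor.hpp"
```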
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <cstdlib>
 #include <iostream>
 #include <numeric>
 #include <type_traits>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "conv_util.hpp"
-#include "device.hpp"
-#include "device_tensor.hpp"
-#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
-#include "element_wise_operation.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "reference_conv_fwd.hpp"
-#include "tensor_layout.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 namespace
 {
 ...
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <cstdlib>
 #include <iostream>
 #include <numeric>
 #include <type_traits>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "conv_util.hpp"
-#include "device.hpp"
-#include "device_tensor.hpp"
-#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
-#include "element_wise_operation.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "reference_conv_fwd.hpp"
-#include "tensor_layout.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 namespace
 {
 ...
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "element_wise_operation.hpp"
-#include "device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp"
-#include "reference_conv_bwd_data.hpp"
+#include <half.hpp>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
 ...
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "element_wise_operation.hpp"
-#include "device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
-#include "reference_conv_backward_weight.hpp"
+#include <half.hpp>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
 ...
example/12_reduce/README.md
 ...
@@ -5,14 +5,14 @@
 # -D <xxx> : input 4-d tensor lengths
 # -v <x> : verification (0=no, 1=yes)
 #arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
 #arg2: time kernel (0=no, 1=yes)
 ./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1
 ```
 Result
 ```
 ./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1
 launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
 Warm up 1 time
 Start running 10 times...
 Perf: 0.282592 ms, 222.641 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
 ...
@@ -24,19 +24,18 @@ Perf: 0.282592 ms, 222.641 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSr
 ```bash
 #arg1: verification (0=no, 1=yes(
 #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
 #arg3: time kernel (0=no, 1=yes)
 ./bin/example_reduce_blockwise_two_call 1 2 1
 ```
 Result
 ```
 ./bin/example_reduce_blockwise_two_call 1 2 1
 launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1}
 Warm up 1 time
 Start running 10 times...
 launch_and_time_kernel: grid_dim {6400, 1, 1}, block_dim {256, 1, 1}
 Warm up 1 time
 Start running 10 times...
 Perf: 2.1791 ms, 771.42 GB/s, DeviceReduceBlockWise<256,M_C32_S1,K_C8_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> => DeviceReduceBlockWise<256,M_C256_S1,K_C1_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1>
 ```
example/12_reduce/reduce_blockwise.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
 #include <getopt.h>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "device_base.hpp"
-#include "device_reduce_multiblock.hpp"
-#include "host_common_util.hpp"
-#include "host_reduction.hpp"
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/host_tensor/host_common_util.hpp"
+#include "ck/library/host_tensor/host_reduction.hpp"
 using namespace ck;
 using namespace ck::tensor_operation::device;
 ...
@@ -33,11 +33,11 @@ constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
 constexpr bool PropagateNan = true;
 constexpr bool OutputIndex  = false;
-using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
-using InElementwiseOperation =
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+using InElementwiseOperation =
+    typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
-using AccElementwiseOperation =
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
+using AccElementwiseOperation =
+    typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
 using DeviceReduceInstance = DeviceReduceMultiBlock<InDataType,
                                                     AccDataType,
 ...
@@ -247,6 +247,13 @@ int main(int argc, char* argv[])
     DeviceMem out_index_dev(indicesSizeInBytes);
+    InElementwiseOperation in_elementwise_op;
+    AccElementwiseOperation acc_elementwise_op;
+    std::tie(in_elementwise_op, acc_elementwise_op) =
+        reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
+            static_cast<int32_t>(reduce_total_length));
     if(args.do_verification)
     {
         ReductionHost<InDataType,
 ...
@@ -261,8 +268,13 @@ int main(int argc, char* argv[])
                       OutputIndex>
             hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-        hostReduce.Run(alpha,
-                       in.mData.data(),
-                       beta,
-                       out_ref.mData.data(),
-                       out_indices_ref.mData.data());
+        hostReduce.Run(alpha,
+                       in.mData.data(),
+                       beta,
+                       out_ref.mData.data(),
+                       out_indices_ref.mData.data(),
+                       in_elementwise_op,
+                       acc_elementwise_op);
     };
     std::vector<ck::index_t> i_inLengths;
 ...
@@ -277,20 +289,19 @@ int main(int argc, char* argv[])
     auto reduce = DeviceReduceInstance{};
     auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
                                                    i_inStrides,
                                                    i_outLengths,
                                                    i_outStrides,
                                                    reduceDims,
                                                    alpha,
                                                    beta,
                                                    in_dev.GetDeviceBuffer(),
                                                    nullptr,
                                                    out_dev.GetDeviceBuffer(),
                                                    out_index_dev.GetDeviceBuffer(),
-                                                   InElementwiseOperation{static_cast<int32_t>(reduce_total_length)},
-                                                   AccElementwiseOperation{static_cast<int32_t>(reduce_total_length)});
+                                                   in_elementwise_op,
+                                                   acc_elementwise_op);
     if(!reduce.IsSupportedArgument(argument_ptr.get()))
     {
 ...
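The functional change in this file: `reduce_binary_operator` and `reduce_unary_operator` no longer take the accumulator data type, and the in/acc element-wise functors are no longer constructed inline from `reduce_total_length`. Instead they are obtained once from `GetElementwiseOperator(...)` and passed to both the host reference reduction and `MakeArgumentPointer`. The snippet below is a self-contained analogue of that factory-plus-`std::tie` pattern; all names are stand-ins, not CK's actual definitions.

```cpp
#include <cstdint>
#include <iostream>
#include <tuple>

// A static factory hands back both element-wise functors for a given reduce
// length, and the caller unpacks them with std::tie (illustrative types only).
struct InElementwiseOp
{
    std::int32_t divider_ = 1;
    float operator()(float x) const { return x; } // pass-through on the input side
};

struct AccElementwiseOp
{
    std::int32_t divider_ = 1;
    float operator()(float acc) const { return acc / static_cast<float>(divider_); }
};

struct reduce_unary_operator_demo
{
    static std::tuple<InElementwiseOp, AccElementwiseOp>
    GetElementwiseOperator(std::int32_t reduce_length)
    {
        return std::make_tuple(InElementwiseOp{reduce_length},
                               AccElementwiseOp{reduce_length});
    }
};

int main()
{
    InElementwiseOp in_elementwise_op;
    AccElementwiseOp acc_elementwise_op;
    std::tie(in_elementwise_op, acc_elementwise_op) =
        reduce_unary_operator_demo::GetElementwiseOperator(64);

    std::cout << acc_elementwise_op(128.0f) << '\n'; // 2, i.e. a mean-style post-scaling
    return 0;
}
```

Building the two functors once and reusing the same objects for the host reference and the device argument keeps the two code paths in sync, which is what the diff above does with `in_elementwise_op`/`acc_elementwise_op`.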
example/12_reduce/reduce_blockwise_two_call.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <sstream>
 ...
@@ -5,20 +8,17 @@
 #include <cstdlib>
 #include <getopt.h>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "device_base.hpp"
-#include "device_reduce_multiblock.hpp"
-#include "host_common_util.hpp"
-#include "host_reduction.hpp"
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/host_tensor/host_common_util.hpp"
+#include "ck/library/host_tensor/host_reduction.hpp"
 using namespace ck;
 using namespace ck::tensor_operation::device;
 ...
@@ -31,13 +31,13 @@ constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
 constexpr bool PropagateNan = true;
 constexpr bool OutputIndex  = false;
-using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
-using InElementwiseOperation =
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+using InElementwiseOperation =
+    typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
-using AccElementwiseOperation =
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
+using AccElementwiseOperation =
+    typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
-using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<AccDataType, AccDataType>;
+using PassThroughOp = tensor_operation::element_wise::PassThrough;
 using DeviceReduceInstance_1 = DeviceReduceMultiBlock<InOutDataType,
                                                       AccDataType,
 ...
@@ -184,6 +184,13 @@ int main(int argc, char* argv[])
     if(beta != 0.0f)
         out_dev.ToDevice(out.mData.data());
+    InElementwiseOperation in_elementwise_op;
+    AccElementwiseOperation acc_elementwise_op;
+    std::tie(in_elementwise_op, acc_elementwise_op) =
+        reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
+            static_cast<int32_t>(reduce_total_length));
     if(do_verify)
     {
         ReductionHost<InOutDataType,
 ...
@@ -198,7 +205,13 @@ int main(int argc, char* argv[])
                       OutputIndex>
             hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-        hostReduce.Run(alpha, in_1.mData.data(), beta, out_ref.mData.data(), nullptr);
+        hostReduce.Run(alpha,
+                       in_1.mData.data(),
+                       beta,
+                       out_ref.mData.data(),
+                       nullptr,
+                       in_elementwise_op,
+                       acc_elementwise_op);
     };
     std::vector<ck::index_t> i_inLengths_1;
 ...
@@ -217,20 +230,19 @@ int main(int argc, char* argv[])
     auto reduce_1 = DeviceReduceInstance_1{};
     auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1,
                                                        i_inStrides_1,
                                                        i_inLengths_2,
                                                        i_inStrides_2,
                                                        reduceDims_1,
                                                        1.0f,
                                                        0.0f,
                                                        in_1_dev.GetDeviceBuffer(),
                                                        nullptr,
                                                        in_2_dev.GetDeviceBuffer(),
                                                        nullptr,
-                                                       InElementwiseOperation{static_cast<int32_t>(reduce_total_length)},
+                                                       in_elementwise_op,
                                                        PassThroughOp{});
     if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
     {
 ...
@@ -243,20 +255,19 @@ int main(int argc, char* argv[])
     auto reduce_2 = DeviceReduceInstance_2{};
     auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2,
                                                        i_inStrides_2,
                                                        i_outLengths,
                                                        i_outStrides,
                                                        reduceDims_2,
                                                        alpha,
                                                        beta,
                                                        in_2_dev.GetDeviceBuffer(),
                                                        nullptr,
                                                        out_dev.GetDeviceBuffer(),
                                                        nullptr,
                                                        PassThroughOp{},
-                                                       AccElementwiseOperation{static_cast<int32_t>(reduce_total_length)});
+                                                       acc_elementwise_op);
     if(!reduce_2.IsSupportedArgument(argument_ptr_2.get()))
     {
 ...
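Besides the same `GetElementwiseOperator` change as in `reduce_blockwise.cpp`, this file switches its `PassThroughOp` alias from the type-parameterized `UnaryIdentic<AccDataType, AccDataType>` to the untyped `element_wise::PassThrough`. The self-contained sketch below illustrates why no template arguments are needed at the alias: a pass-through functor whose call operator is itself templated works for any type pair (illustrative only, not CK's exact definition).

```cpp
#include <iostream>

// An untyped pass-through functor: the call operator is templated, so the
// using-declaration that names it needs no per-type template arguments.
struct PassThrough
{
    template <typename Y, typename X>
    void operator()(Y& y, const X& x) const
    {
        y = static_cast<Y>(x);
    }
};

int main()
{
    PassThrough op;
    float y = 0.0f;
    op(y, 3);               // int  -> float with the same functor object
    double z = 0.0;
    op(z, y);               // float -> double, still no template arguments at the alias
    std::cout << z << '\n'; // 3
    return 0;
}
```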
example/13_pool2d_fwd/pool2d_fwd_common.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <iostream>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
-#include "reduction_functions_accumulate.hpp"
-#include "device_pool2d_fwd_nhwc_nhwc.hpp"
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
 template <typename InDataType,
           typename OutDataType,
 ...
@@ -31,16 +32,15 @@ static void pool_host_verify(const Tensor<InDataType>& in,
                              const std::array<ck::index_t, 2>& in_left_pads,
                              const std::array<ck::index_t, 2>& /*in_right_pads*/)
 {
-    const int32_t divider = window_spatial_lengths[0] * window_spatial_lengths[1];
+    const int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1];
-    using ReduceOperation = typename ck::reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+    using ReduceOperation = typename ck::reduce_binary_operator<ReduceOpId>::opType;
-    using InElementwiseOperation =
-        typename ck::reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
-    using AccElementwiseOperation =
-        typename ck::reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
-    const InElementwiseOperation in_elementwise_op(divider);
-    const AccElementwiseOperation acc_elementwise_op(divider);
+    auto elementwise_ops =
+        ck::reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);
+    auto in_elementwise_op  = std::get<0>(elementwise_ops);
+    auto acc_elementwise_op = std::get<1>(elementwise_ops);
     if constexpr(!OutputIndex)
     {
 ...
@@ -48,7 +48,7 @@ static void pool_host_verify(const Tensor<InDataType>& in,
             ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
         auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
-            auto accuVal = ReduceOperation::GetIdentityValue();
+            auto accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
             for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
             {
 ...
@@ -86,7 +86,7 @@ static void pool_host_verify(const Tensor<InDataType>& in,
                                                              AccDataType,
                                                              IndexDataType>;
         auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
-            auto accuVal = ReduceOperation::GetIdentityValue();
+            auto accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
             IndexDataType accuIndex = 0;
             for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
 ...
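In the host verification, the accumulator's identity value is now requested as a template member of the reduce operation, `ReduceOperation::template GetIdentityValue<AccDataType>()`, because the operation type no longer carries the accumulator type itself (the `template` disambiguator is needed in the example because `ReduceOperation` is a dependent type there). A self-contained analogue with a stand-in `Max` operation, not CK's `ck::reduce::Max`:

```cpp
#include <iostream>
#include <limits>

// Stand-in reduce operation: the accumulator type is supplied per call site
// via GetIdentityValue<AccDataType>() instead of being baked into the type.
struct Max
{
    template <typename T>
    static T GetIdentityValue()
    {
        return std::numeric_limits<T>::lowest();
    }

    template <typename T>
    void operator()(T& acc, T v) const
    {
        acc = (v > acc) ? v : acc;
    }
};

int main()
{
    using AccDataType = float;
    auto accuVal = Max::GetIdentityValue<AccDataType>();
    Max{}(accuVal, 1.5f);
    Max{}(accuVal, -2.0f);
    std::cout << accuVal << '\n'; // 1.5
    return 0;
}
```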
example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <cstdlib>
-#include "config.hpp"
-#include "tensor_layout.hpp"
-#include "reduction_enums.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/reduction_enums.hpp"
 #include "pool2d_fwd_common.hpp"
 ...
example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <cstdlib>
-#include "config.hpp"
-#include "tensor_layout.hpp"
-#include "reduction_enums.hpp"
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "pool2d_fwd_common.hpp"
 ...
example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_gemm.hpp"
-#include "device_tensor.hpp"
-#include "device_gemm_xdl_cshuffle.hpp"
-#include "element_wise_operation.hpp"
-#include "reference_gemm.hpp"
-#include "gemm_specialization.hpp"
+#include <half.hpp>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
 struct RequantReluRequant
 {
 ...
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_gemm.hpp"
-#include "device_tensor.hpp"
-#include "device_grouped_gemm_xdl.hpp"
-#include "element_wise_operation.hpp"
-#include "reference_gemm.hpp"
-#include "gemm_specialization.hpp"
+#include <half.hpp>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 ...
example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "device_gemm_reduce_xdl_cshuffle.hpp"
-#include "element_wise_operation.hpp"
-#include "reference_gemm.hpp"
-#include "gemm_specialization.hpp"
-#include "element_wise_reduce_operation.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 ...
@@ -31,20 +33,19 @@ using BDataType = F16;
 using CDataType         = F16;
 using GemmAccDataType   = F32;
 using ReduceAccDataType = F32;
-using DDataType         = F64;
-using DPtrsGlobal       = ck::Tuple<DDataType*>;
+using ReduceDataType    = F64;
+using ReducePtrsGlobal  = ck::Tuple<ReduceDataType*>;
 using ALayout = ck::tensor_layout::gemm::RowMajor;
 using BLayout = ck::tensor_layout::gemm::ColumnMajor;
 using CLayout = ck::tensor_layout::gemm::RowMajor;
 using AElementOp = ck::tensor_operation::element_wise::PassThrough;
 using BElementOp = ck::tensor_operation::element_wise::PassThrough;
 using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-using DsReduceOp  = ck::Tuple<ck::reduce::Max<ReduceAccDataType>>;
-using DsElementOp = ck::Tuple<
-    ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>>;
-using DGlobalMemOp =
-    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;
+using ReduceOps        = ck::Tuple<ck::reduce::Max>;
+using ReduceElementOps = ck::Tuple<ck::tensor_operation::element_wise::PassThrough>;
+using ReduceGlobalMemOps =
+    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;
 static constexpr auto GemmSpecialization =
 ...
@@ -52,11 +53,11 @@ static constexpr auto GemmSpecialization =
 // clang-format off
 using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
 //###### [long template-parameter comment table; the "DData/Dxs*" column labels are renamed to "ReduceData/Reduce*" in the new version]
-        <Row, Col, Row, F16, F16, F16, F32, F32, ReduceAccDataType, DPtrsGlobal, AElementOp, BElementOp, CElementOp, DsReduceOp, DsElementOp, DsElementOp, DGlobalMemOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
+        <Row, Col, Row, F16, F16, F16, F32, F32, ReduceAccDataType, ReducePtrsGlobal, AElementOp, BElementOp, CElementOp, ReduceOps, ReduceElementOps, ReduceElementOps, ReduceGlobalMemOps, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
 // clang-format on
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
 ...
@@ -67,12 +68,12 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
                                                                         BElementOp,
                                                                         CElementOp>;
-template <typename ADataType, typename BDataType, typename CDataType, typename DDataType>
+template <typename ADataType, typename BDataType, typename CDataType, typename ReduceDataType>
 void DumpGemmLayerNormPerf(float gemm_reduce_time, int M, int N, int K)
 {
     std::size_t gemm_flop = std::size_t(2) * M * N * K;
-    std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
-                                sizeof(CDataType) * M * N + sizeof(DDataType) * M;
+    std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                sizeof(CDataType) * M * N + sizeof(ReduceDataType) * M;
     float tflops          = static_cast<float>(gemm_flop) / 1.E9 / gemm_reduce_time;
     float gemm_gb_per_sec = gemm_num_byte / 1.E6 / gemm_reduce_time;
 ...
@@ -147,17 +148,17 @@ int main(int argc, char* argv[])
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<DDataType> d_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce_m_host_result(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<DDataType> d_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce_m_device_result(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
     std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-    std::cout << "d_m: " << d_m_host_result.mDesc << std::endl;
+    std::cout << "reduce_m: " << reduce_m_host_result.mDesc << std::endl;
     switch(init_method)
     {
 ...
@@ -175,35 +176,40 @@ int main(int argc, char* argv[])
     DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
     DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
     DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce_device_buf(sizeof(ReduceDataType) * reduce_m_device_result.mDesc.GetElementSpace());
     a_device_buf.ToDevice(a_m_k.mData.data());
     b_device_buf.ToDevice(b_k_n.mData.data());
     auto a_element_op = AElementOp{};
     auto b_element_op = BElementOp{};
     auto c_element_op = CElementOp{};
-    auto ds_element_op = DsElementOp{};
-    auto p_ds_global   = ck::make_tuple(static_cast<DDataType*>(d_device_buf.GetDeviceBuffer()));
+    auto reduce_element_op = ReduceElementOps{}[ck::Number<0>{}];
+    std::array<void*, 3> gemm_element_ops   = {&a_element_op, &b_element_op, &c_element_op};
+    std::array<void*, 1> reduce_element_ops = {&reduce_element_op};
+    std::array<void*, 1> p_reduces          = {reduce_device_buf.GetDeviceBuffer()};
     // do GEMM
     auto gemm    = DeviceGemmReduceInstance{};
     auto invoker = gemm.MakeInvoker();
-    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
-                                      static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                                      static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
-                                      p_ds_global,
-                                      M,
-                                      N,
-                                      K,
-                                      StrideA,
-                                      StrideB,
-                                      StrideC,
-                                      a_element_op,
-                                      b_element_op,
-                                      c_element_op,
-                                      ds_element_op,
-                                      ds_element_op);
+    auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                      b_device_buf.GetDeviceBuffer(),
+                                      nullptr,
+                                      {},
+                                      c_device_buf.GetDeviceBuffer(),
+                                      p_reduces,
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      StrideC,
+                                      {},
+                                      gemm_element_ops,
+                                      {},
+                                      reduce_element_ops,
+                                      reduce_element_ops);
     if(!gemm.IsSupportedArgument(argument))
     {
 ...
@@ -214,7 +220,7 @@ int main(int argc, char* argv[])
     // [CAUSION]: launch_and_time_kernel will not initialize D.
     // If we evaluate kernel multiple time but without initialize D. Verification will fail
-    d_device_buf.SetValue(ck::NumericLimits<DDataType>::Lowest());
+    reduce_device_buf.SetValue(ck::NumericLimits<ReduceDataType>::Lowest());
     invoker.Run(argument, StreamConfig{nullptr, false});
     bool pass = true;
 ...
@@ -222,7 +228,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
         c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        d_device_buf.FromDevice(d_m_device_result.mData.data());
+        reduce_device_buf.FromDevice(reduce_m_device_result.mData.data());
         auto ref_gemm    = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
 ...
@@ -232,23 +238,27 @@ int main(int argc, char* argv[])
         ref_invoker.Run(ref_argument);
-        auto d_reduce_op = DsReduceOp{}[ck::Number<0>{}];
+        auto reduce_op = ReduceOps{}[ck::Number<0>{}];
         for(int m = 0; m < M; ++m)
         {
-            ReduceAccDataType d_acc = d_reduce_op.GetIdentityValue();
+            ReduceAccDataType reduce_acc = reduce_op.GetIdentityValue<ReduceAccDataType>();
             for(int n = 0; n < N; ++n)
-                d_reduce_op(d_acc, c_m_n_host_result(m, n));
+            {
+                ReduceAccDataType curr_val =
+                    ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n));
+                reduce_op(reduce_acc, curr_val);
+            };
-            d_m_host_result(m) = d_acc;
+            reduce_m_host_result(m) = reduce_acc;
         }
         pass = ck::utils::check_err(c_m_n_device_result.mData,
                                     c_m_n_host_result.mData,
                                     "Error: Incorrect results c") &&
-               ck::utils::check_err(d_m_device_result.mData,
-                                    d_m_host_result.mData,
+               ck::utils::check_err(reduce_m_device_result.mData,
+                                    reduce_m_host_result.mData,
                                     "Error: Incorrect results d",
                                     1e-3,
                                     1e-3);
 ...
@@ -258,7 +268,7 @@ int main(int argc, char* argv[])
     {
         float gemm_reduceMax_ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
-        DumpGemmLayerNormPerf<ADataType, BDataType, CDataType, DDataType>(
+        DumpGemmLayerNormPerf<ADataType, BDataType, CDataType, ReduceDataType>(
             gemm_reduceMax_ave_time, M, N, K);
     }
 ...
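The largest interface change in this example is how arguments reach the device op: buffers are now passed as plain `void*` (no `static_cast` to the element types at the call site), and the element-wise functors travel as `std::array<void*, N>` of pointers that the operation casts back internally. Below is a self-contained sketch of that type-erasure pattern; the names are illustrative, not CK's API.

```cpp
#include <array>
#include <iostream>

// The caller stores pointers to its concrete element-wise ops in a
// std::array<void*, N>; the callee casts each slot back to the type it expects.
struct PassThrough
{
    float operator()(float x) const { return x; }
};

float apply_erased(const std::array<void*, 1>& element_ops, float x)
{
    auto* op = static_cast<PassThrough*>(element_ops[0]); // slot 0 is known to hold a PassThrough
    return (*op)(x);
}

int main()
{
    PassThrough c_element_op;
    std::array<void*, 1> gemm_element_ops = {&c_element_op};
    std::cout << apply_erased(gemm_element_ops, 2.5f) << '\n'; // 2.5
    return 0;
}
```

The trade-off is the usual one for type erasure: the call site becomes uniform across data types and operator choices, while correctness of each `static_cast` back to the concrete functor type rests on the template parameters the instance was declared with.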
example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp
(diff collapsed in the original page; contents not shown)
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
-#include "config.hpp"
-#include "conv_util.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "tensor_layout.hpp"
-#include "element_wise_operation.hpp"
-#include "device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp"
-#include "reference_conv_bwd_data.hpp"
+#include <half.hpp>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
 ...
example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
 #include <cstdlib>
-#include <stdlib.h>
-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "device_batched_gemm_reduce_xdl_cshuffle.hpp"
-#include "element_wise_operation.hpp"
-#include "reduction_operator.hpp"
-#include "reference_batched_gemm.hpp"
-#include "gemm_specialization.hpp"
+#include <half.hpp>
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 ...
@@ -29,28 +31,26 @@ using ADataType = F16;
 using BDataType         = F16;
 using CDataType         = F16;
 using ReduceAccDataType = F32;
-using DDataType   = F32;
-using DPtrsGlobal = ck::Tuple<DDataType*, DDataType*>;
+using ReduceDataType   = F32;
+using ReducePtrsGlobal = ck::Tuple<ReduceDataType*, ReduceDataType*>;
 using ALayout = ck::tensor_layout::gemm::RowMajor;
 using BLayout = ck::tensor_layout::gemm::ColumnMajor;
 using CLayout = ck::tensor_layout::gemm::RowMajor;
 using AElementOp = ck::tensor_operation::element_wise::PassThrough;
 using BElementOp = ck::tensor_operation::element_wise::PassThrough;
 using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-using D0ReduceOp  = ck::reduce::Add<ReduceAccDataType>;
-using D1ReduceOp  = ck::reduce::Add<ReduceAccDataType>;
-using DxsReduceOp = ck::Tuple<D0ReduceOp, D1ReduceOp>;
+using ReduceOp0 = ck::reduce::Add;
+using ReduceOp1 = ck::reduce::Add;
+using ReduceOps = ck::Tuple<ReduceOp0, ReduceOp1>;
-using UnaryIdenticElementOp =
-    ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>;
-using UnarySquareElementOp =
-    ck::tensor_operation::element_wise::UnarySquare<ReduceAccDataType, ReduceAccDataType, false>;
-using DxsInElementOp  = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
-using DxsOutElementOp = ck::Tuple<UnaryIdenticElementOp, UnaryIdenticElementOp>;
-using DGlobalMemOp =
-    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
-                                          ck::InMemoryDataOperationEnum::AtomicAdd>;
+using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
+using UnarySquareElementOp  = ck::tensor_operation::element_wise::UnarySquare;
+using ReduceInElementOps  = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
+using ReduceOutElementOps = ck::Tuple<UnaryIdenticElementOp, UnaryIdenticElementOp>;
+using ReduceGlobalMemOps =
+    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
+                                          ck::InMemoryDataOperationEnum::AtomicAdd>;
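The two reductions this example performs are expressed as a tuple of two `Add` reductions whose input element-wise ops differ: a pass-through for the first and a square for the second, so one pass over the data yields both a plain sum and a sum of squares. A self-contained illustration of that pairing, with stand-in functor names rather than CK's definitions:

```cpp
#include <array>
#include <iostream>
#include <tuple>

// Two Add-style accumulations over the same input: the second squares each
// element first, mirroring the ReduceOps / ReduceInElementOps pairing above.
struct PassThroughOp
{
    float operator()(float x) const { return x; }
};
struct UnarySquareOp
{
    float operator()(float x) const { return x * x; }
};

int main()
{
    std::array<float, 4> row = {1.0f, 2.0f, 3.0f, 4.0f};
    auto reduce_in_element_ops = std::make_tuple(PassThroughOp{}, UnarySquareOp{});

    float sum = 0.0f, sum_sq = 0.0f;
    for(float x : row)
    {
        sum    += std::get<0>(reduce_in_element_ops)(x);
        sum_sq += std::get<1>(reduce_in_element_ops)(x);
    }
    std::cout << sum << ' ' << sum_sq << '\n'; // 10 30
    return 0;
}
```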
...
@@ -63,7 +63,7 @@ using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatc
 //######| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
 //######| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
 //######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-        < Row, Col, Row, F16, F16, F16, F32, F32, F32, DPtrsGlobal, AElementOp, BElementOp, CElementOp, DxsReduceOp, DxsInElementOp, DxsOutElementOp, DGlobalMemOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
+        < Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, AElementOp, BElementOp, CElementOp, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
 // clang-format on
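Reading the instance against the column header above, the leading arguments appear to line up as follows; this mapping is my own partial annotation of the visible columns, not text from the commit:

// Best-effort annotation of DeviceBatchedGemmReduceInstance<...> (assumed, not authoritative):
//   Row, Col, Row                 -> A/B/C layouts
//   F16, F16, F16                 -> A/B/C data types
//   F32, F32, F32                 -> accumulation / shuffle / reduce accumulation data types
//   ReducePtrsGlobal              -> tuple of global reduce-output pointer types
//   AElementOp, BElementOp, CElementOp, ReduceOps, ReduceInElementOps, ReduceOutElementOps
//                                 -> element-wise and reduction operations
//   ReduceGlobalMemOps            -> AtomicAdd for both reduce outputs
//   GemmSpecialization, 1, 256, 256, 128, 32
//                                 -> specialization, prefetch stage, block size, M/N/K per block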
 using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
...
@@ -143,16 +143,16 @@ int main(int argc, char* argv[])
     Tensor<CDataType> c_g_m_n_host_result(
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<DDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<ReduceDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<DDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<ReduceDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
     Tensor<CDataType> c_g_m_n_device_result(
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<DDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<ReduceDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<DDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<ReduceDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));

     std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
...
@@ -177,38 +177,48 @@ int main(int argc, char* argv[])
     DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
     DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpace());
     DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpace());
-    DeviceMem d0_device_buf(sizeof(DDataType) * d0_g_m_device_result.mDesc.GetElementSpace());
-    DeviceMem d1_device_buf(sizeof(DDataType) * d1_g_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
+                                 d0_g_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
+                                 d1_g_m_device_result.mDesc.GetElementSpace());

     a_device_buf.ToDevice(a_g_m_k.mData.data());
     b_device_buf.ToDevice(b_g_k_n.mData.data());

     auto a_element_op = AElementOp{};
     auto b_element_op = BElementOp{};
     auto c_element_op = CElementOp{};
-    auto dxs_global = ck::make_tuple(static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()),
-                                     static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer()));
+    std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};
+    auto passthrough = UnaryIdenticElementOp{};
+    auto square      = UnarySquareElementOp{};
+    std::array<void*, 2> reduce_in_element_ops  = {&passthrough, &square};
+    std::array<void*, 2> reduce_out_element_ops = {&passthrough, &passthrough};
+    std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
+                                      reduce1_device_buf.GetDeviceBuffer()};

     // do GEMM
     auto batched_gemm = DeviceBatchedGemmReduceInstance{};
     auto invoker      = batched_gemm.MakeInvoker();
-    auto argument =
-        batched_gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
-                                  b_device_buf.GetDeviceBuffer(),
-                                  c_device_buf.GetDeviceBuffer(),
-                                  dxs_global,
-                                  M,
-                                  N,
-                                  K,
-                                  StrideA,
-                                  StrideB,
-                                  StrideC,
-                                  a_element_op,
-                                  b_element_op,
-                                  c_element_op,
-                                  DxsInElementOp{},
-                                  DxsOutElementOp{},
-                                  BatchCount);
+    auto argument =
+        batched_gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                  static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                  nullptr,
+                                  {},
+                                  static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                  p_reduces,
+                                  M,
+                                  N,
+                                  K,
+                                  StrideA,
+                                  StrideB,
+                                  StrideC,
+                                  {},
+                                  gemm_element_ops,
+                                  {},
+                                  reduce_in_element_ops,
+                                  reduce_out_element_ops,
+                                  BatchCount);
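For orientation, a hedged reading of the new argument list; the slot labels below are my own, inferred from the values passed, and are not names taken from this commit:

// My own annotation of the MakeArgument call above (inferred, not authoritative):
//   A pointer, B pointer            -> typed input matrices
//   nullptr, {}                     -> optional extra inputs left unused by this example
//   C pointer, p_reduces            -> GEMM output plus the two reduction output buffers
//   M, N, K, StrideA/B/C, {}        -> problem sizes and strides
//   gemm_element_ops                -> {&a_element_op, &b_element_op, &c_element_op}
//   {}                              -> element ops for the unused extra inputs
//   reduce_in_element_ops, reduce_out_element_ops, BatchCount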
     if(!batched_gemm.IsSupportedArgument(argument))
     {
...
@@ -218,8 +228,8 @@ int main(int argc, char* argv[])
     }

     // init DO, D1 to 0
-    d0_device_buf.SetZero();
-    d1_device_buf.SetZero();
+    reduce0_device_buf.SetZero();
+    reduce1_device_buf.SetZero();

     // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result
     // will not be correct. need to set time_kernel = false for correctness test
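Because the reduce outputs are written with AtomicAdd, every extra launch accumulates on top of the previous results; a timed loop would have to clear the buffers between runs. A minimal sketch of that pattern, where the StreamConfig usage is assumed from other CK examples rather than taken from this file:

// Sketch only: reset the atomic-add outputs before each repeated launch.
// for(int i = 0; i < n_repeat; ++i)
// {
//     reduce0_device_buf.SetZero();
//     reduce1_device_buf.SetZero();
//     invoker.Run(argument, StreamConfig{nullptr, false}); // time_kernel = false
// }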
...
@@ -241,8 +251,8 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
         c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
-        d0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
-        d1_device_buf.FromDevice(d1_g_m_device_result.mData.data());
+        reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
+        reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data());

         auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
         auto ref_invoker      = ref_batched_gemm.MakeInvoker();
...
@@ -252,30 +262,31 @@ int main(int argc, char* argv[])
         ref_invoker.Run(ref_argument);

-        auto d0_reduce_op = D0ReduceOp{};
-        auto d1_reduce_op = D1ReduceOp{};
+        auto reduce0_op = ReduceOp0{};
+        auto reduce1_op = ReduceOp1{};

         for(int batch = 0; batch < BatchCount; ++batch)
         {
             for(int m = 0; m < M; ++m)
             {
-                float d0_acc = d0_reduce_op.GetIdentityValue();
-                float d1_acc = d1_reduce_op.GetIdentityValue();
+                auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
+                auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();

                 for(int n = 0; n < N; ++n)
                 {
-                    float c_val  = ck::type_convert<float>(c_g_m_n_host_result(batch, m, n));
-                    float d0_val = 0;
-                    float d1_val = 0;
+                    auto c_val =
+                        ck::type_convert<ReduceAccDataType>(c_g_m_n_host_result(batch, m, n));
+                    ReduceAccDataType d0_val;
+                    ReduceAccDataType d1_val;

                     UnaryIdenticElementOp{}(d0_val, c_val);
                     UnarySquareElementOp{}(d1_val, c_val);
-                    d0_reduce_op(d0_acc, d0_val);
-                    d1_reduce_op(d1_acc, d1_val);
+                    reduce0_op(reduce0_acc, d0_val);
+                    reduce1_op(reduce1_acc, d1_val);
                 }
-                d0_g_m_host_result(batch, m) = ck::type_convert<DDataType>(d0_acc);
-                d1_g_m_host_result(batch, m) = ck::type_convert<DDataType>(d1_acc);
+                d0_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce0_acc);
+                d1_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce1_acc);
             }
         }
...
example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2022 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <cstdlib>

-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "binary_element_wise_operation.hpp"
-#include "device_binary_elementwise.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"

 using F16 = ck::half_t;
 using F32 = float;
...
@@ -42,8 +20,7 @@ using ABDataType = F16;
 using CDataType              = F16;
 using EltwiseComputeDataType = F32;

-using Add = ck::tensor_operation::binary_element_wise::
-    Add<EltwiseComputeDataType, EltwiseComputeDataType, EltwiseComputeDataType>;
+using Add = ck::tensor_operation::element_wise::Add;

 using DeviceElementwiseAddInstance =
     ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
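On the new side, Add is the plain functor from element_wise and no longer carries the compute data types as template parameters; those now come from the DeviceBinaryElementwise instance instead. Roughly, the functor reduces to the following (a from-memory sketch of its shape, not the literal header contents):

// Simplified sketch of ck::tensor_operation::element_wise::Add (assumed shape, not verbatim):
struct Add
{
    template <typename Y, typename X0, typename X1>
    __host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const
    {
        y = x0 + x1; // element-wise sum of the two inputs
    }
};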
...
@@ -122,15 +99,17 @@ int main()
     a_m_n_device_buf.ToDevice(a_m_n.mData.data());
     b_n_device_buf.ToDevice(b_n.mData.data());

+    std::array<const void*, 2> input = {a_m_n_device_buf.GetDeviceBuffer(),
+                                        b_n_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {c_m_n_device_buf.GetDeviceBuffer()};
+    std::vector<ck::index_t> a_strides = {Stride, 1};
+    std::vector<ck::index_t> b_strides = {0, 1};
+    std::vector<ck::index_t> c_strides = {Stride, 1};
+
     auto broadcastAdd = DeviceElementwiseAddInstance{};
-    auto argument     = broadcastAdd.MakeArgumentPointer(a_m_n_device_buf.GetDeviceBuffer(),
-                                                         b_n_device_buf.GetDeviceBuffer(),
-                                                         c_m_n_device_buf.GetDeviceBuffer(),
-                                                         {M, N},
-                                                         {Stride, 1},
-                                                         {0, 1}, // broadcast in first dimension
-                                                         {Stride, 1},
-                                                         Add{});
+    auto argument     = broadcastAdd.MakeArgumentPointer(
+        input, output, {M, N}, {a_strides, b_strides}, {c_strides}, Add{});

     if(!broadcastAdd.IsSupportedArgument(argument.get()))
     {
...
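The stride lists are what express the broadcast here: b_strides = {0, 1} gives B a zero stride along M, so the kernel computes c(m, n) = a(m, n) + b(n). A standalone host-side sketch of the same indexing, with made-up sizes (illustrative only, not part of the commit):

#include <cstdio>
#include <vector>

int main()
{
    const int M = 2, N = 3, Stride = 3;
    std::vector<float> a = {0, 1, 2, 3, 4, 5}; // a(m, n), strides {Stride, 1}
    std::vector<float> b = {10, 20, 30};       // b(n), viewed with strides {0, 1}
    std::vector<float> c(M * Stride);

    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
            c[m * Stride + n] = a[m * Stride + n] + b[m * 0 + n * 1]; // zero stride along M

    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
            std::printf("c(%d,%d) = %g\n", m, n, c[m * Stride + n]);
    return 0;
}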
example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <cstdlib>

-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "binary_element_wise_operation.hpp"
-#include "device_binary_elementwise.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"

 using F16 = ck::half_t;
 using F32 = float;
...
@@ -17,8 +20,7 @@ using ABDataType = F16;
 using CDataType              = F16;
 using EltwiseComputeDataType = F32;

-using Add = ck::tensor_operation::binary_element_wise::
-    Add<EltwiseComputeDataType, EltwiseComputeDataType, EltwiseComputeDataType>;
+using Add = ck::tensor_operation::element_wise::Add;

 using DeviceElementwiseAddInstance =
     ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
...
@@ -79,18 +81,24 @@ int main()
     a_m_device_buf.ToDevice(a_m.mData.data());
     b_m_n_k_device_buf.ToDevice(b_m_n_k.mData.data());

+    std::array<const void*, 2> input = {a_m_device_buf.GetDeviceBuffer(),
+                                        b_m_n_k_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {c_m_n_k_device_buf.GetDeviceBuffer()};
+    std::vector<ck::index_t> a_strides = {1, 0, 0};
+    std::vector<ck::index_t> b_strides{b_m_n_k.mDesc.GetStrides().begin(),
+                                       b_m_n_k.mDesc.GetStrides().end()};
+    std::vector<ck::index_t> c_strides{c_m_n_k.mDesc.GetStrides().begin(),
+                                       c_m_n_k.mDesc.GetStrides().end()};
+
     auto broadcastAdd = DeviceElementwiseAddInstance{};
-    auto argument =
-        broadcastAdd.MakeArgumentPointer(a_m_device_buf.GetDeviceBuffer(),
-                                         b_m_n_k_device_buf.GetDeviceBuffer(),
-                                         c_m_n_k_device_buf.GetDeviceBuffer(),
-                                         std::vector<ck::index_t>{mnk.begin(), mnk.end()},
-                                         {1, 0, 0}, // broadcast A on second and third dimension
-                                         std::vector<ck::index_t>{b_m_n_k.mDesc.GetStrides().begin(),
-                                                                  b_m_n_k.mDesc.GetStrides().end()},
-                                         std::vector<ck::index_t>{c_m_n_k.mDesc.GetStrides().begin(),
-                                                                  c_m_n_k.mDesc.GetStrides().end()},
-                                         Add{});
+    auto argument =
+        broadcastAdd.MakeArgumentPointer(input,
+                                         output,
+                                         std::vector<ck::index_t>{mnk.begin(), mnk.end()},
+                                         {a_strides, b_strides},
+                                         {c_strides},
+                                         Add{});

     if(!broadcastAdd.IsSupportedArgument(argument.get()))
     {
...
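Here the broadcast goes the other way: a_strides = {1, 0, 0} makes A depend only on the first index, so the result is c(m, n, k) = a(m) + b(m, n, k). A sketch of the index math implied by those strides, in comment form and reusing the example's own names:

// a_strides = {1, 0, 0}  =>  A is read as a[m * 1 + n * 0 + k * 0] = a[m]
// for(int m = 0; m < M; ++m)
//     for(int n = 0; n < N; ++n)
//         for(int k = 0; k < K; ++k)
//             c(m, n, k) = a_m(m) + b_m_n_k(m, n, k);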
example/19_binary_elementwise/elementwise_add_1d.cpp
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2022 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <cstdlib>

-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "binary_element_wise_operation.hpp"
-#include "device_binary_elementwise.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
+#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"

 using F16 = ck::half_t;
 using F32 = float;
...
@@ -42,8 +19,7 @@ using ABDataType = F16;
 using CDataType              = F16;
 using EltwiseComputeDataType = F32;

-using Add = ck::tensor_operation::binary_element_wise::
-    Add<EltwiseComputeDataType, EltwiseComputeDataType, EltwiseComputeDataType>;
+using Add = ck::tensor_operation::element_wise::Add;

 using DeviceElementwiseAddInstance =
     ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
...
@@ -103,15 +79,17 @@ int main()
     a_m_device_buf.ToDevice(a_m.mData.data());
     b_m_device_buf.ToDevice(b_m.mData.data());

+    std::array<const void*, 2> input = {a_m_device_buf.GetDeviceBuffer(),
+                                        b_m_device_buf.GetDeviceBuffer()};
+    std::array<void*, 1> output      = {c_m_device_buf.GetDeviceBuffer()};
+    std::vector<ck::index_t> a_strides = {1};
+    std::vector<ck::index_t> b_strides = {1};
+    std::vector<ck::index_t> c_strides = {1};
+
     auto broadcastAdd = DeviceElementwiseAddInstance{};
-    auto argument     = broadcastAdd.MakeArgumentPointer(a_m_device_buf.GetDeviceBuffer(),
-                                                         b_m_device_buf.GetDeviceBuffer(),
-                                                         c_m_device_buf.GetDeviceBuffer(),
-                                                         {M},
-                                                         {1},
-                                                         {1},
-                                                         {1},
-                                                         Add{});
+    auto argument     = broadcastAdd.MakeArgumentPointer(
+        input, output, {M}, {{a_strides}, b_strides}, {c_strides}, Add{});

     if(!broadcastAdd.IsSupportedArgument(argument.get()))
     {
...