Commit ef326c73 authored by Alan Turner

Merge remote-tracking branch 'origin/develop' into migraphx-update

parents b7775add e4dfe4d8
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -27,7 +27,12 @@ using DeviceElementwiseAddInstance =
     ck::Tuple<CDataType>,
     Add,
     2,
+    64,
+    64,
+    64,
     8,
+    8,
+    ck::Sequence<1, 0>,
     ck::Sequence<8, 8>,
     ck::Sequence<8>>;
...
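For orientation, the switch from device_elementwise_impl.hpp to device_elementwise_dynamic_vector_dims_impl.hpp is what drives the longer template-argument lists in these examples: instead of a single per-thread vector size, each instance now carries a block-level tile description. The annotated sketch below restates the 2D add instance from the hunk above; the parameter labels are taken from the commented DeviceNormalizeInstance later in this commit, while the class name and the leading input tuple (ADataType, BDataType) are assumptions, since this hunk starts below them.

// Annotated sketch only, not part of the diff. Assumes the example's usual type
// aliases (ADataType, BDataType, CDataType, Add) and the DeviceElementwiseImpl
// name from the new header; labels not commented elsewhere in this commit are
// best-effort guesses.
using DeviceElementwiseAddInstance =
    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType, BDataType>, // inputs
                                                        ck::Tuple<CDataType>,            // output
                                                        Add,                // element-wise functor
                                                        2,                  // NumDim
                                                        64,                 // BlockSize
                                                        64,                 // MPerBlock
                                                        64,                 // NPerBlock
                                                        8,                  // MPerThread
                                                        8,                  // NPerThread
                                                        ck::Sequence<1, 0>, // ThreadClusterArrangeOrder
                                                        ck::Sequence<8, 8>, // scalarPerVector: inputs
                                                        ck::Sequence<8>>;   // scalarPerVector: output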
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -27,9 +27,14 @@ using DeviceElementwiseAddInstance =
     ck::Tuple<CDataType>,
     Add,
     3,
-    8,
-    ck::Sequence<1, 8>,
-    ck::Sequence<8>>;
+    64,
+    16,
+    16,
+    2,
+    2,
+    ck::Sequence<1, 0>,
+    ck::Sequence<1, 2>,
+    ck::Sequence<2>>;
 
 template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
 void host_broadcast3D_am_bmnk(HostTensorC& C,
...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <cstdlib>
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -25,9 +25,14 @@ using DeviceElementwiseAddInstance =
     ck::Tuple<CDataType>,
     Add,
     1,
-    8,
-    ck::Sequence<8, 8>,
-    ck::Sequence<8>>;
+    64,
+    16,
+    16,
+    2,
+    2,
+    ck::Sequence<1, 0>,
+    ck::Sequence<2, 2>,
+    ck::Sequence<2>>;
 
 template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
 void host_elementwise1D(
...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -27,9 +27,14 @@ using DeviceElementwiseAddInstance =
     ck::Tuple<CDataType>,
     Add,
     4,
-    8,
-    ck::Sequence<8, 8>,
-    ck::Sequence<8>>;
+    64,
+    2,
+    128,
+    2,
+    2,
+    ck::Sequence<1, 0>,
+    ck::Sequence<2, 2>,
+    ck::Sequence<2>>;
 
 template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
 void host_elementwise4D(HostTensorC& C,
...
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
-set(target 0)
-foreach(gpu IN LISTS GPU_TARGETS)
-    if(gpu IN_LIST gpu_list AND target EQUAL 0)
-        add_custom_target(example_grouped_conv_bwd_weight)
-        if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-            add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp)
-            add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16)
-        endif()
-        if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-            add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp)
-            add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_bf16)
-        endif()
-        set(target 1)
-    endif()
-endforeach()
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    if(DL_KERNELS)
-        add_custom_target(example_grouped_conv_bwd_weight_dl)
-        add_example_executable(example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp)
-        add_dependencies(example_grouped_conv_bwd_weight_dl example_grouped_conv_bwd_weight_dl_fp16)
-    endif()
-endif()
\ No newline at end of file
+add_custom_target(example_grouped_conv_bwd_weight)
+
+add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp)
+add_example_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16)
+
+add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp)
+add_example_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_bf16)
+
+add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8 grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp)
+add_example_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8)
+
+add_example_executable(example_grouped_conv_bwd_weight_wmma_fp16 grouped_conv_bwd_weight_wmma_fp16.cpp)
+add_example_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_wmma_fp16)
+
+add_example_executable(example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp)
+add_example_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_dl_fp16)
@@ -23,6 +23,8 @@
 using BF16 = ck::bhalf_t;
 using F16 = ck::half_t;
 using F32 = float;
+using F8 = ck::f8_t;
+using BF8 = ck::bf8_t;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -40,25 +42,21 @@ struct CommonLayoutSetting
     using OutputLayout = OutputLay;
 };
 
-template <ck::index_t NDimSpatial>
-struct CommonLayoutSettingSelector;
-
 namespace ctl = ck::tensor_layout::convolution;
 
-template <>
-struct CommonLayoutSettingSelector<1> final : CommonLayoutSetting<ctl::GNWC, ctl::GKXC, ctl::GNWK>
-{
-};
-
-template <>
-struct CommonLayoutSettingSelector<2> final
-    : CommonLayoutSetting<ctl::GNHWC, ctl::GKYXC, ctl::GNHWK>
-{
-};
-
-template <>
-struct CommonLayoutSettingSelector<3> final
-    : CommonLayoutSetting<ctl::GNDHWC, ctl::GKZYXC, ctl::GNDHWK>
+template <ck::index_t NDimSpatial>
+struct CommonLayoutSettingSelector
+    : CommonLayoutSetting<ck::tuple_element_t<NDimSpatial - 1,
+                                              ck::Tuple<ck::tensor_layout::convolution::GNWC,
+                                                        ck::tensor_layout::convolution::GNHWC,
+                                                        ck::tensor_layout::convolution::GNDHWC>>,
+                          ck::tuple_element_t<NDimSpatial - 1,
+                                              ck::Tuple<ck::tensor_layout::convolution::GKXC,
+                                                        ck::tensor_layout::convolution::GKYXC,
+                                                        ck::tensor_layout::convolution::GKZYXC>>,
+                          ck::tuple_element_t<NDimSpatial - 1,
+                                              ck::Tuple<ck::tensor_layout::convolution::GNWK,
+                                                        ck::tensor_layout::convolution::GNHWK,
+                                                        ck::tensor_layout::convolution::GNDHWK>>>
 {
 };
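The generic selector above replaces the three explicit specializations with a single template that indexes a ck::Tuple of layout tags by NDimSpatial - 1. A small compile-time check illustrating the mapping; it assumes CommonLayoutSetting also exposes an InputLayout alias (only OutputLayout is visible in this hunk) and that <type_traits> is included:

// Illustrative check only (not part of the commit): the selector should map
// 1D/2D/3D to GNWC/GNHWC/GNDHWC for the input tensor.
static_assert(std::is_same_v<CommonLayoutSettingSelector<2>::InputLayout,
                             ck::tensor_layout::convolution::GNHWC>,
              "2D grouped conv bwd-weight examples use the GNHWC input layout");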
@@ -78,10 +76,10 @@ struct ExecutionConfig final
     bool time_kernel = false;
 };
 
 #define DefaultConvParam                                                                 \
     ck::utils::conv::ConvParam                                                           \
     {                                                                                    \
-        2, 4, 1, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, { 1, 1 }            \
+        3, 4, 1, 128, 256, {3, 3, 3}, {14, 14, 14}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, { 1, 1, 1 } \
     }
 
 inline void print_help_msg()
...
@@ -3,7 +3,7 @@
 
 #include "common.hpp"
 
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_dl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp"
 
 using InDataType = F16;
 using WeiDataType = F16;
@@ -15,45 +15,84 @@ using WeiElementOp = PassThrough;
 using OutElementOp = PassThrough;
 
 template <ck::index_t NDimSpatial>
-using DeviceConvBwdWeightInstance =
-    ck::tensor_operation::device::DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl<
-        NDimSpatial, // NDimSpatial
+using DeviceConvBwdWeightInstance = ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Dl<
+    NDimSpatial, // NDimSpatial
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::GNWC,
+                                  ck::tensor_layout::convolution::GNHWC,
+                                  ck::tensor_layout::convolution::GNDHWC>>, // InLayout
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::GKXC,
+                                  ck::tensor_layout::convolution::GKYXC,
+                                  ck::tensor_layout::convolution::GKZYXC>>, // WeiLayout
+    ck::tuple_element_t<NDimSpatial - 1,
+                        ck::Tuple<ck::tensor_layout::convolution::GNWK,
+                                  ck::tensor_layout::convolution::GNHWK,
+                                  ck::tensor_layout::convolution::GNDHWK>>, // OutLayout
     InDataType,           // InDataType
     WeiDataType,          // WeiDataType
    	OutDataType,          // OutDataType
     AccDataType,          // AccDataType
     InElementOp,          // InElementwiseOperation
     WeiElementOp,         // WeiElementwiseOperation
     OutElementOp,         // OutElementwiseOperation
     ConvBwdWeightDefault, // ConvBackwardWeightSpecialization
     256,                  // BlockSize
     128,                  // MPerBlock
     128,                  // NPerBlock
     16,                   // K0PerBlock
     2,                    // K1
     4,                    // M1PerThread
     4,                    // N1PerThread
     1,                    // KPerThread
     S<8, 2>,              // M1N1ThreadClusterM1Xs
     S<8, 2>,              // M1N1ThreadClusterN1Xs
     S<1, 8, 1, 1, 2>,     // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
     S<1, 2, 1, 128, 1>,   // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
     S<0, 2, 3, 1, 4>,     // ABlockTransferThreadClusterArrangeOrder
     S<0, 2, 3, 1, 4>,     // ABlockTransferSrcAccessOrder
     S<1, 1, 1, 1, 1>,     // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
     S<0, 2, 3, 1, 4>,     // ABlockTransferSrcVectorTensorContiguousDimOrder
     S<1, 1, 1, 1, 1>,     // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
     S<1, 1, 1, 8, 2>,     // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
     S<1, 16, 1, 16, 1>,   // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
     S<0, 1, 4, 2, 3>,     // BBlockTransferThreadClusterArrangeOrder
     S<0, 1, 4, 2, 3>,     // BBlockTransferSrcAccessOrder
     S<1, 1, 1, 8, 1>,     // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
     S<0, 1, 4, 2, 3>,     // BBlockTransferSrcVectorTensorContiguousDimOrder
     S<1, 1, 1, 1, 2>,     // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
     S<0, 1, 2, 3, 4, 5>,  // CThreadTransferSrcDstAccessOrder
     5,                    // CThreadTransferSrcDstVectorDim
     4>;                   // CThreadTransferDstScalarPerVector
+
+template <ck::index_t NDimSpatial>
+using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+                                                                                     InDataType,
+                                                                                     WeiDataType,
+                                                                                     OutDataType,
+                                                                                     InElementOp,
+                                                                                     WeiElementOp,
+                                                                                     OutElementOp>;
 
 #include "run_grouped_conv_bwd_weight_example.inc"
 
-int main(int argc, char* argv[]) { return !run_grouped_conv_bwd_weight_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    ExecutionConfig config;
+    ck::utils::conv::ConvParam conv_param = DefaultConvParam;
+
+    if(!parse_cmd_args(argc, argv, config, conv_param))
+    {
+        return 1;
+    }
+
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1: return !run_grouped_conv_bwd_weight<1>(config, conv_param);
+    case 2: return !run_grouped_conv_bwd_weight<2>(config, conv_param);
+    case 3: return !run_grouped_conv_bwd_weight<3>(config, conv_param);
+    default: break;
+    }
+
+    return 1;
+}
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp"
using InDataType = F16;
using WeiDataType = F16;
using OutDataType = F16;
using AccDataType = F32;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = PassThrough;
template <ck::index_t NDimSpatial>
using DeviceConvBwdWeightInstance =
ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Wmma_CShuffle<
NDimSpatial,
ck::tensor_layout::convolution::GNDHWC,
ck::tensor_layout::convolution::GKZYXC,
ck::tensor_layout::convolution::GNDHWK,
InDataType, // InDataType
WeiDataType, // WeiDataType
OutDataType, // OutDataType
AccDataType, // AccDataType
InElementOp, // InElementwiseOperation
WeiElementOp, // WeiElementwiseOperation
OutElementOp, // OutElementwiseOperation
ConvBwdWeightDefault, // ConvolutionBackwardWeightSpecialization
256, // BlockSize
128, // MPerBlock
128, // NPerBlock
4, // K0PerBlock
8, // K1
16, // MPerWMMA
16, // NPerWMMA
4, // MRepeat
2, // NRepeat
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<0, 2, 1>, // ABlockTransferThreadClusterArrangeOrder
S<0, 2, 1>, // ABlockTransferSrcAccessOrder
1, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
true, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<0, 2, 1>, // BBlockTransferThreadClusterArrangeOrder
S<0, 2, 1>, // BBlockTransferSrcAccessOrder
1, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
true, // BBlockLdsExtraN
4,
2,
S<1, 32, 1, 8>,
1>;
template <ck::index_t NDimSpatial>
using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
#include "run_grouped_conv_bwd_weight_example.inc"
int main(int argc, char* argv[])
{
ExecutionConfig config;
ck::utils::conv::ConvParam conv_param = DefaultConvParam;
if(!parse_cmd_args(argc, argv, config, conv_param))
{
return 1;
}
switch(conv_param.num_dim_spatial_)
{
case 3: return !run_grouped_conv_bwd_weight<3>(config, conv_param);
default: break;
}
return 1;
}
@@ -67,6 +67,34 @@ using DeviceConvBwdWeightInstance =
         S<1, 32, 1, 4>,                          // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
         128 / (sizeof(WeiDataType) * CHAR_BIT)>; // CBlockTransferScalarPerVector_NWaveNPerXdl
 
+template <ck::index_t NDimSpatial>
+using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+                                                                                     InDataType,
+                                                                                     WeiDataType,
+                                                                                     OutDataType,
+                                                                                     InElementOp,
+                                                                                     WeiElementOp,
+                                                                                     OutElementOp>;
+
 #include "run_grouped_conv_bwd_weight_example.inc"
 
-int main(int argc, char* argv[]) { return !run_grouped_conv_bwd_weight_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    ExecutionConfig config;
+    ck::utils::conv::ConvParam conv_param = DefaultConvParam;
+
+    if(!parse_cmd_args(argc, argv, config, conv_param))
+    {
+        return 1;
+    }
+
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1: return !run_grouped_conv_bwd_weight<1>(config, conv_param);
+    case 2: return !run_grouped_conv_bwd_weight<2>(config, conv_param);
+    case 3: return !run_grouped_conv_bwd_weight<3>(config, conv_param);
+    default: break;
+    }
+
+    return 1;
+}
@@ -66,6 +66,34 @@ using DeviceConvBwdWeightInstance =
         S<1, 32, 1, 4>,                          // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
         128 / (sizeof(WeiDataType) * CHAR_BIT)>; // CBlockTransferScalarPerVector_NWaveNPerXdl
 
+template <ck::index_t NDimSpatial>
+using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+                                                                                     InDataType,
+                                                                                     WeiDataType,
+                                                                                     OutDataType,
+                                                                                     InElementOp,
+                                                                                     WeiElementOp,
+                                                                                     OutElementOp>;
+
 #include "run_grouped_conv_bwd_weight_example.inc"
 
-int main(int argc, char* argv[]) { return !run_grouped_conv_bwd_weight_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    ExecutionConfig config;
+    ck::utils::conv::ConvParam conv_param = DefaultConvParam;
+
+    if(!parse_cmd_args(argc, argv, config, conv_param))
+    {
+        return 1;
+    }
+
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1: return !run_grouped_conv_bwd_weight<1>(config, conv_param);
+    case 2: return !run_grouped_conv_bwd_weight<2>(config, conv_param);
+    case 3: return !run_grouped_conv_bwd_weight<3>(config, conv_param);
+    default: break;
+    }
+
+    return 1;
+}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
using InDataType = F16;
using WeiDataType = F16;
using OutDataType = F16;
using AccDataType = F32;
using ComputeTypeA = BF8;
using ComputeTypeB = F8;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = PassThrough;
template <ck::index_t NDimSpatial>
using DeviceConvBwdWeightInstance =
ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
NDimSpatial,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::GNWC,
ck::tensor_layout::convolution::GNHWC,
ck::tensor_layout::convolution::GNDHWC>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::GKXC,
ck::tensor_layout::convolution::GKYXC,
ck::tensor_layout::convolution::GKZYXC>>,
ck::tuple_element_t<NDimSpatial - 1,
ck::Tuple<ck::tensor_layout::convolution::GNWK,
ck::tensor_layout::convolution::GNHWK,
ck::tensor_layout::convolution::GNDHWK>>,
InDataType, // InDataType
WeiDataType, // WeiDataType
OutDataType, // OutDataType
AccDataType, // AccDataType
InElementOp, // InElementwiseOperation
WeiElementOp, // WeiElementwiseOperation
OutElementOp, // OutElementwiseOperation
ConvBwdWeightDefault, // ConvolutionBackwardWeightSpecialization
256, // BlockSize
128, // MPerBlock
128, // NPerBlock
4, // K0PerBlock
8, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
2, // NXdlPerWave
S<1, 4, 16, 4>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<0, 3, 1, 2>, // ABlockTransferThreadClusterArrangeOrder
S<0, 2, 1, 3>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
1, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<1, 4, 16, 4>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<0, 3, 1, 2>, // BBlockTransferThreadClusterArrangeOrder
S<0, 2, 1, 3>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
1, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 4>, // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
2, // CBlockTransferScalarPerVector_NWaveNPerXdl
ComputeTypeA, // ComputeTypeA
ComputeTypeB>; // ComputeTypeB
template <ck::index_t NDimSpatial>
using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
0,
0,
0,
ComputeTypeA,
ComputeTypeB>;
#include "run_grouped_conv_bwd_weight_example.inc"
int main(int argc, char* argv[])
{
ExecutionConfig config;
ck::utils::conv::ConvParam conv_param = DefaultConvParam;
if(!parse_cmd_args(argc, argv, config, conv_param))
{
return 1;
}
switch(conv_param.num_dim_spatial_)
{
case 1: return !run_grouped_conv_bwd_weight<1>(config, conv_param);
case 2: return !run_grouped_conv_bwd_weight<2>(config, conv_param);
case 3: return !run_grouped_conv_bwd_weight<3>(config, conv_param);
default: break;
}
return 1;
}
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
-template <ck::index_t NDimSpatial>
-using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
-                                                                                     InDataType,
-                                                                                     WeiDataType,
-                                                                                     OutDataType,
-                                                                                     InElementOp,
-                                                                                     WeiElementOp,
-                                                                                     OutElementOp>;
-
 template <ck::index_t NDimSpatial>
 bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
                                  const ck::utils::conv::ConvParam& conv_param)
 {
-    ck::index_t split_k;
-    // Set split_k = 2 for xdl op, split_k = 1 for dl
-    // Dl op doesn't support split_k > 1
-    // TODO: Add Dl op split_k > 1 support
-    if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
-         ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
-         ck::get_device_name() == "gfx1102"))
-    {
-        split_k = 2;
-    }
-    else
-    {
-        split_k = 1;
-    }
+    // Dl and WMMA ops don't support split_k > 1
+    constexpr ck::index_t split_k = 1;
 
     const auto in_g_n_c_wis_desc =
         ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<
@@ -58,8 +37,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
         out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
         break;
     default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 0.2});
+        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.1, 0.1});
     }
 
     DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
@@ -125,18 +104,7 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
         return true;
     }
 
-    float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
-
-    std::size_t flop      = conv_param.GetFlops();
-    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
-
-    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
-    float gb_per_sec = num_btype / 1.E6 / avg_time;
-
-    std::cerr << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl
-              << "DeviceOp: " << conv.GetTypeString() << std::endl;
+    invoker.Run(argument, StreamConfig{nullptr, false});
 
     if(config.do_verification)
     {
@@ -151,7 +119,10 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
                                                   conv_param.input_right_pads_,
                                                   InElementOp{},
                                                   WeiElementOp{},
-                                                  OutElementOp{});
+                                                  OutElementOp{},
+                                                  {},
+                                                  {},
+                                                  {});
 
         ref_invoker.Run(ref_argument);
@@ -160,25 +131,18 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
         return ck::utils::check_err(wei_device_result.mData, wei_host_result.mData);
     }
 
-    return true;
-}
-
-bool run_grouped_conv_bwd_weight_example(int argc, char* argv[])
-{
-    ExecutionConfig config;
-    ck::utils::conv::ConvParam conv_param = DefaultConvParam;
-
-    if(!parse_cmd_args(argc, argv, config, conv_param))
-    {
-        return false;
-    }
-
-    switch(conv_param.num_dim_spatial_)
-    {
-    case 1: return run_grouped_conv_bwd_weight<1>(config, conv_param);
-    case 2: return run_grouped_conv_bwd_weight<2>(config, conv_param);
-    case 3: return run_grouped_conv_bwd_weight<3>(config, conv_param);
-    }
-
-    return false;
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+    std::size_t flop      = conv_param.GetFlops();
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+    float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
+
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+
+    std::cerr << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl
+              << "DeviceOp: " << conv.GetTypeString() << std::endl;
+
+    return true;
 }
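For readers skimming the elided middle of this file: the hunks above sit inside the usual CK example flow of building the device op, constructing an argument, checking support, and running the invoker. A condensed sketch of that flow is below; the MakeArgument parameter list is deliberately elided (it is not part of this diff), and the placement of the support check is an assumption based on the early return visible above.

// Condensed sketch of run_grouped_conv_bwd_weight's elided middle; conv,
// argument and invoker are the same names used in the hunks above.
auto conv     = DeviceConvBwdWeightInstance<NDimSpatial>{};
auto invoker  = conv.MakeInvoker();
auto argument = conv.MakeArgument(/* tensor pointers, conv_param lengths/strides, element ops, split_k */);

if(!conv.IsSupportedArgument(argument))
{
    std::cerr << "The device op instance does not support this problem." << std::endl;
    return true; // treat unsupported configurations as a skip, not a failure
}

invoker.Run(argument, StreamConfig{nullptr, false});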
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
-    set(target 0)
-    foreach(gpu IN LISTS GPU_TARGETS)
-        if(gpu IN_LIST gpu_list AND target EQUAL 0)
-            add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp)
-            add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp)
-            add_example_executable(example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp)
-            add_example_executable(example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp)
-            set(target 1)
-        endif()
-    endforeach()
-endif()
+add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp)
+add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp)
+add_example_executable(example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp)
+add_example_executable(example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp)
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -9,7 +9,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/utility/device_memory.hpp"
@@ -103,9 +103,14 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseI
     ck::Tuple<LayerNormOutDataType>, // y
     NormalizeFunctor,
     2,
-    8,                           // MPerthread
-    ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
-    ck::Sequence<8>>;            // scalarPerVector: y(layerNorm_out)
+    64,                          // BlockSize
+    16,                          // MPerBlock
+    16,                          // NPerBlock
+    2,                           // MPerthread
+    2,                           // NPerthread
+    ck::Sequence<1, 0>,          // ThreadClusterArrangeOrder
+    ck::Sequence<2, 1, 1, 2, 2>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
+    ck::Sequence<2>>;            // scalarPerVector: y(layerNorm_out)
 
 auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
     return HostTensorDescriptor({len}, {stride});
...
@@ -114,12 +114,15 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
                          BetaDataType,
                          HDataType,
                          AccDataType,
+                         AccDataType,
                          HElementOp,
                          2,
                          1>;
 
     Tensor<EMeanVarDataType> e_m_n(HostTensorDescriptor{M, N});
     Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N});
+    Tensor<AccDataType> save_mean({M});
+    Tensor<AccDataType> save_inv_std({M});
 
     auto ref_gemm         = ReferenceGemm{};
     auto ref_gemm_invoker = ref_gemm.MakeInvoker();
@@ -145,7 +148,7 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
     auto ref_layernorm_invoker  = ref_layernorm.MakeInvoker();
     auto ref_layernorm_argument = ref_layernorm.MakeArgument(
-        e_m_n, gamma_n, beta_n, h_m_n, h_element_op, {M, N}, {1}, epsilon);
+        e_m_n, gamma_n, beta_n, h_m_n, save_mean, save_inv_std, h_element_op, {M, N}, {1}, epsilon);
 
     ref_layernorm_invoker.Run(ref_layernorm_argument);
 }
...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <numeric>
@@ -9,7 +9,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 #include "ck/library/utility/device_memory.hpp"
@@ -102,9 +102,14 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseI
     ck::Tuple<LayerNormOutDataType>, // y
     NormalizeFunctor,
     2,
-    8,                           // MPerthread
-    ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
-    ck::Sequence<8>>;            // scalarPerVector: y(layerNorm_out)
+    64,                          // BlockSize
+    16,                          // MPerBlock
+    16,                          // NPerBlock
+    2,                           // MPerthread
+    2,                           // NPerthread
+    ck::Sequence<1, 0>,          // ThreadClusterArrangeOrder
+    ck::Sequence<2, 1, 1, 2, 2>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
+    ck::Sequence<2>>;            // scalarPerVector: y(layerNorm_out)
 
 auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
     return HostTensorDescriptor({len}, {stride});
...
 add_custom_target(example_cgemm_xdl)
-if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_cgemm_xdl_bf16 cgemm_xdl_bf16.cpp)
-    add_dependencies(example_cgemm_xdl example_cgemm_xdl_bf16)
-endif()
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_cgemm_xdl_fp16 cgemm_xdl_fp16.cpp)
-    add_dependencies(example_cgemm_xdl example_cgemm_xdl_fp16)
-endif()
-if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-    add_example_executable(example_cgemm_xdl_fp32 cgemm_xdl_fp32.cpp)
-    add_dependencies(example_cgemm_xdl example_cgemm_xdl_fp32)
-endif()
-if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-    add_example_executable(example_cgemm_xdl_int8 cgemm_xdl_int8.cpp)
-    add_dependencies(example_cgemm_xdl example_cgemm_xdl_int8)
-endif()
+
+add_example_executable(example_cgemm_xdl_bf16 cgemm_xdl_bf16.cpp)
+add_example_dependencies(example_cgemm_xdl example_cgemm_xdl_bf16)
+
+add_example_executable(example_cgemm_xdl_fp16 cgemm_xdl_fp16.cpp)
+add_example_dependencies(example_cgemm_xdl example_cgemm_xdl_fp16)
+
+add_example_executable(example_cgemm_xdl_fp32 cgemm_xdl_fp32.cpp)
+add_example_dependencies(example_cgemm_xdl example_cgemm_xdl_fp32)
+
+add_example_executable(example_cgemm_xdl_int8 cgemm_xdl_int8.cpp)
+add_example_dependencies(example_cgemm_xdl example_cgemm_xdl_int8)
+
 if(USE_BITINT_EXTENSION_INT4)
     add_example_executable(example_cgemm_xdl_int4 cgemm_xdl_int4.cpp)
-    add_dependencies(example_cgemm_xdl example_cgemm_xdl_int4)
+    add_example_dependencies(example_cgemm_xdl example_cgemm_xdl_int4)
 endif()

 add_custom_target(example_batched_gemm_xdl)
-if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-    add_example_executable(example_batched_gemm_xdl_fp32 batched_gemm_xdl_fp32.cpp)
-    add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp32)
-endif()
-if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_batched_gemm_xdl_fp16 batched_gemm_xdl_fp16.cpp)
-    add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp16)
-endif()
-if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-    add_example_executable(example_batched_gemm_xdl_bfp16 batched_gemm_xdl_bfp16.cpp)
-    add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bfp16)
-endif()
-if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-    add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp)
-    add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int8)
-endif()
+
+add_example_executable(example_batched_gemm_xdl_fp32 batched_gemm_xdl_fp32.cpp)
+add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp32)
+
+add_example_executable(example_batched_gemm_xdl_fp16 batched_gemm_xdl_fp16.cpp)
+add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp16)
+
+add_example_executable(example_batched_gemm_xdl_bf16 batched_gemm_xdl_bf16.cpp)
+add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bf16)
+
+add_example_executable(example_batched_gemm_xdl_bf16_v3 batched_gemm_xdl_bf16_v3.cpp)
+add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bf16_v3)
+
+add_example_executable(example_batched_gemm_xdl_fp8_rowwise_v3 batched_gemm_xdl_fp8_rowwise_v3.cpp)
+add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp8_rowwise_v3)
+
+add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp)
+add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int8)
+
 if(USE_BITINT_EXTENSION_INT4)
     add_example_executable(example_batched_gemm_xdl_int4 batched_gemm_xdl_int4.cpp)
-    add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int4)
+    add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int4)
 endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = BF16;
using BDataType = BF16;
using AccDataType = F32;
using CShuffleDataType = BF16;
using DsDataType = ck::Tuple<>;
using EDataType = BF16;
using ALayout = Row;
using BLayout = Col;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<
ALayout,
BLayout,
DsLayout,
ELayout,
ADataType,
BDataType,
DsDataType,
EDataType,
AccDataType,
CShuffleDataType,
AElementOp,
BElementOp,
CDEElementOp,
GemmDefault,
256, // BlockSize
256, // MPerBlock
128, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXDL
32, // NPerXDL
4, // MXdlPerWave
2, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
S<8>, // CDEShuffleBlockTransferScalarPerVectors
ck::BlockGemmPipelineScheduler::Intrawave, // BlockGemmPipelineScheduler
ck::BlockGemmPipelineVersion::v3 // BlockGemmPipelineVersion
>;
#include "run_batched_gemm_example.inc"
int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); }
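One note on the element-wise operator choices in this new example: AElementOp, BElementOp and CDEElementOp are all PassThrough, and DsDataType/DsLayout are empty tuples, so the kernel computes a plain batched GEMM with no auxiliary D tensors and no epilogue math. Conceptually, PassThrough behaves like the identity functor sketched below (an illustration only; the real definition lives in element_wise_operation.hpp and may differ in detail):

// Illustration of the PassThrough idea, not CK's actual implementation.
struct PassThroughSketch
{
    template <typename Y, typename X>
    void operator()(Y& y, const X& x) const
    {
        y = static_cast<Y>(x); // forward the value unchanged (apart from type conversion)
    }
};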